Skip to content

Commit

Permalink
[X86] X86FixupVectorConstantsPass - attempt to match VEX logic ops ba…
Browse files Browse the repository at this point in the history
…ck to EVEX if we can create a broadcast fold

On non-DQI AVX512 targets, X86InstrInfo::setExecutionDomainCustom will convert EVEX int-domain instructions to VEX fp-domain instructions. However, if we have the chance to use a broadcast fold, we're better off using an EVEX instruction, so handle the reverse fold.
  • Loading branch information
RKSimon committed Nov 21, 2023
1 parent f802fed commit 1552b91
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 23 deletions.
89 changes: 78 additions & 11 deletions llvm/lib/Target/X86/X86FixupVectorConstants.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasAVX2 = ST->hasAVX2();
bool HasDQI = ST->hasDQI();
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();

auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
unsigned OpBcst64, unsigned OpBcst32,
Expand Down Expand Up @@ -352,20 +353,22 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
1);
}

// Attempt to find a AVX512 mapping from a full width memory-fold instruction
// to a broadcast-fold instruction variant.
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) {
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
unsigned OpBcst32 = 0, OpBcst64 = 0;
unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
if (const X86MemoryFoldTableEntry *Mem2Bcst =
llvm::lookupBroadcastFoldTable(Opc, 32)) {
OpBcst32 = Mem2Bcst->DstOp;
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
if (OpSrc32) {
if (const X86MemoryFoldTableEntry *Mem2Bcst =
llvm::lookupBroadcastFoldTable(OpSrc32, 32)) {
OpBcst32 = Mem2Bcst->DstOp;
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
}
}
if (const X86MemoryFoldTableEntry *Mem2Bcst =
llvm::lookupBroadcastFoldTable(Opc, 64)) {
OpBcst64 = Mem2Bcst->DstOp;
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
if (OpSrc64) {
if (const X86MemoryFoldTableEntry *Mem2Bcst =
llvm::lookupBroadcastFoldTable(OpSrc64, 64)) {
OpBcst64 = Mem2Bcst->DstOp;
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
}
}
assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) &&
"OperandNo mismatch");
Expand All @@ -374,6 +377,70 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
}
return false;
};

// Attempt to find a AVX512 mapping from a full width memory-fold instruction
// to a broadcast-fold instruction variant.
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX)
return ConvertToBroadcastAVX512(Opc, Opc);

// Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic
// conversion to see if we can convert to a broadcasted (integer) logic op.
if (HasVLX && !HasDQI) {
unsigned OpSrc32 = 0, OpSrc64 = 0;
switch (Opc) {
case X86::VANDPDrm:
case X86::VANDPSrm:
case X86::VPANDrm:
OpSrc32 = X86 ::VPANDDZ128rm;
OpSrc64 = X86 ::VPANDQZ128rm;
break;
case X86::VANDPDYrm:
case X86::VANDPSYrm:
case X86::VPANDYrm:
OpSrc32 = X86 ::VPANDDZ256rm;
OpSrc64 = X86 ::VPANDQZ256rm;
break;
case X86::VANDNPDrm:
case X86::VANDNPSrm:
case X86::VPANDNrm:
OpSrc32 = X86 ::VPANDNDZ128rm;
OpSrc64 = X86 ::VPANDNQZ128rm;
break;
case X86::VANDNPDYrm:
case X86::VANDNPSYrm:
case X86::VPANDNYrm:
OpSrc32 = X86 ::VPANDNDZ256rm;
OpSrc64 = X86 ::VPANDNQZ256rm;
break;
case X86::VORPDrm:
case X86::VORPSrm:
case X86::VPORrm:
OpSrc32 = X86 ::VPORDZ128rm;
OpSrc64 = X86 ::VPORQZ128rm;
break;
case X86::VORPDYrm:
case X86::VORPSYrm:
case X86::VPORYrm:
OpSrc32 = X86 ::VPORDZ256rm;
OpSrc64 = X86 ::VPORQZ256rm;
break;
case X86::VXORPDrm:
case X86::VXORPSrm:
case X86::VPXORrm:
OpSrc32 = X86 ::VPXORDZ128rm;
OpSrc64 = X86 ::VPXORQZ128rm;
break;
case X86::VXORPDYrm:
case X86::VXORPSYrm:
case X86::VPXORYrm:
OpSrc32 = X86 ::VPXORDZ256rm;
OpSrc64 = X86 ::VPXORQZ256rm;
break;
}
if (OpSrc32 || OpSrc64)
return ConvertToBroadcastAVX512(OpSrc32, OpSrc64);
}

return false;
Expand Down
18 changes: 14 additions & 4 deletions llvm/test/CodeGen/X86/combine-abs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,20 @@ define <16 x i8> @combine_v16i8_abs_constant(<16 x i8> %a) {
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_v16i8_abs_constant:
; AVX: # %bb.0:
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
; AVX2-LABEL: combine_v16i8_abs_constant:
; AVX2: # %bb.0:
; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_v16i8_abs_constant:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: combine_v16i8_abs_constant:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VL-NEXT: retq
%1 = insertelement <16 x i8> undef, i8 15, i32 0
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
%3 = and <16 x i8> %a, %2
Expand Down
13 changes: 9 additions & 4 deletions llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -711,10 +711,15 @@ define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; AVX: # %bb.0:
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
}
Expand Down
23 changes: 19 additions & 4 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2362,10 +2362,25 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) {
}

define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
; ALL: # %bb.0:
; ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ALL-NEXT: retq
; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
; AVX2: # %bb.0:
; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
; XOP: # %bb.0:
; XOP-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOP-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
ret <32 x i8> %shuffle
}
Expand Down

0 comments on commit 1552b91

Please sign in to comment.