
Commit acbc5ed

[X86][SSE] getFauxShuffle - support insert(truncate/extend(extract(vec0,c0)),vec1,c1) shuffle patterns at the byte level
Followup to the PR45604 fix at rGe71dd7c011a3 where we disabled most of these cases. By creating the shuffle at the byte level we can handle any extension/truncation as long as we track how small the scalar got and assume that the upper bytes will need to be zero.
1 parent 33f043c commit acbc5ed
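For illustration only (not part of the commit), here is a minimal standalone C++ sketch of the byte-level mask construction described in the commit message. It hard-codes the extract2_i32_zext_insert1_i64_zero case from the tests below (extract i32 element 2, zext to i64, insert into element 1 of a <2 x i64>) and uses plain integers in place of LLVM's SDValue/EVT machinery; the variable names mirror the patch, but the program itself is a hypothetical model, not the getFauxShuffleMask API.

// Hypothetical standalone model; the constants are taken from one test case,
// and only the masking logic mirrors the patch.
#include <algorithm>
#include <cstdio>
#include <vector>

constexpr int SM_SentinelUndef = -1; // "don't care" byte
constexpr int SM_SentinelZero = -2;  // byte known to be zero

int main() {
  // Destination <2 x i64>: 16 bytes total, 8 bytes per element, insert lane 1.
  const unsigned NumSizeInBytes = 16, NumBytesPerElt = 8, DstIdx = 1;
  // Source <4 x i32>: extract lane 2; the scalar never got narrower than 32
  // bits while peeking through the zext, so MinBitsPerElt = 32.
  const unsigned SrcScalarBits = 32, SrcIdx = 2, MinBitsPerElt = 32;

  const unsigned SrcByte = SrcIdx * (SrcScalarBits / 8); // byte 8 of the source
  const unsigned DstByte = DstIdx * NumBytesPerElt;      // byte 8 of the result

  // 'Identity' byte mask referencing the insertion target (operand 1), whose
  // bytes are numbered NumSizeInBytes..2*NumSizeInBytes-1.
  std::vector<int> Mask(NumSizeInBytes);
  for (unsigned i = 0; i != NumSizeInBytes; ++i)
    Mask[i] = NumSizeInBytes + i;

  // Copy the bytes that survived the truncate/extend chain, then zero-pad the
  // rest of the destination element (the upper bytes of the zext'd scalar).
  const unsigned MinBytesPerElt = std::min(MinBitsPerElt / 8, NumBytesPerElt);
  for (unsigned i = 0; i != MinBytesPerElt; ++i)
    Mask[DstByte + i] = SrcByte + i;
  for (unsigned i = MinBytesPerElt; i != NumBytesPerElt; ++i)
    Mask[DstByte + i] = SM_SentinelZero;

  // Expected output: 16 17 18 19 20 21 22 23 8 9 10 11 -2 -2 -2 -2
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}

Bytes 0-7 stay from the insertion target, bytes 8-11 come from i32 element 2 of the source, and bytes 12-15 are forced to zero, which matches the blendps/andps codegen the updated buildvec-extract.ll checks expect.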

File tree

llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/buildvec-extract.ll
llvm/test/CodeGen/X86/buildvec-insertvec.ll
llvm/test/CodeGen/X86/extract-concat.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll

5 files changed: +142 -81 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 18 additions & 18 deletions
@@ -7462,16 +7462,18 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     }
 
     // Peek through trunc/aext/zext.
-    // TODO: handle elements smaller than VT.
     // TODO: aext shouldn't require SM_SentinelZero padding.
     // TODO: handle shift of scalars.
+    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
     while (Scl.getOpcode() == ISD::TRUNCATE ||
            Scl.getOpcode() == ISD::ANY_EXTEND ||
            Scl.getOpcode() == ISD::ZERO_EXTEND) {
       Scl = Scl.getOperand(0);
-      if (Scl.getScalarValueSizeInBits() < NumBitsPerElt)
-        return false;
+      if (MinBitsPerElt > Scl.getScalarValueSizeInBits())
+        MinBitsPerElt = Scl.getScalarValueSizeInBits();
     }
+    if ((MinBitsPerElt % 8) != 0)
+      return false;
 
     // Attempt to find the source vector the scalar was extracted from.
     SDValue SrcExtract;
@@ -7486,31 +7488,29 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
 
     SDValue SrcVec = SrcExtract.getOperand(0);
     EVT SrcVT = SrcVec.getValueType();
-    unsigned NumSrcElts = SrcVT.getVectorNumElements();
-    unsigned NumZeros =
-        std::max<int>((NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1, 0);
-
-    if ((NumSrcElts % NumElts) != 0)
+    if (!SrcVT.getScalarType().isByteSized())
       return false;
-
     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
-    if (NumSrcElts <= SrcIdx)
-      return false;
+    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+    unsigned DstByte = DstIdx * NumBytesPerElt;
 
+    // Create 'identity' byte level shuffle mask and then add inserted bytes.
     if (Opcode == ISD::SCALAR_TO_VECTOR) {
       Ops.push_back(SrcVec);
-      Mask.append(NumSrcElts, SM_SentinelUndef);
+      Mask.append(NumSizeInBytes, SM_SentinelUndef);
     } else {
       Ops.push_back(SrcVec);
       Ops.push_back(N.getOperand(0));
-      for (int i = 0; i != (int)NumSrcElts; ++i)
-        Mask.push_back(NumSrcElts + i);
+      for (int i = 0; i != (int)NumSizeInBytes; ++i)
+        Mask.push_back(NumSizeInBytes + i);
     }
 
-    int Scale = NumSrcElts / NumElts;
-    Mask[Scale * DstIdx] = SrcIdx;
-    for (int i = 0; i != (int)NumZeros; ++i)
-      Mask[(Scale * DstIdx) + i + 1] = SM_SentinelZero;
+    unsigned MinBytesPerElts = MinBitsPerElt / 8;
+    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+    for (unsigned i = 0; i != MinBytesPerElts; ++i)
+      Mask[DstByte + i] = SrcByte + i;
+    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+      Mask[DstByte + i] = SM_SentinelZero;
     return true;
   }
   case X86ISD::PACKSS:

llvm/test/CodeGen/X86/buildvec-extract.ll

Lines changed: 18 additions & 17 deletions
@@ -293,24 +293,19 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $2, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; AVX: # %bb.0:
-; AVX-NEXT: vextractps $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX-NEXT: retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -386,16 +381,22 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 }
 
 define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert0_i64_zero:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $0, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert0_i64_zero:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $0, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64

llvm/test/CodeGen/X86/buildvec-insertvec.ll

Lines changed: 1 addition & 4 deletions
@@ -21,10 +21,7 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
 ; SSE41-LABEL: foo:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT: pextrb $8, %xmm0, %eax
-; SSE41-NEXT: pextrb $4, %xmm0, %ecx
-; SSE41-NEXT: pinsrb $1, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $2, %eax, %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE41-NEXT: movl $255, %eax
 ; SSE41-NEXT: pinsrb $3, %eax, %xmm0
 ; SSE41-NEXT: movd %xmm0, (%rdi)

llvm/test/CodeGen/X86/extract-concat.ll

Lines changed: 2 additions & 8 deletions
@@ -24,10 +24,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; SSE42-LABEL: foo:
 ; SSE42: # %bb.0:
 ; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT: pextrb $8, %xmm0, %eax
-; SSE42-NEXT: pextrb $4, %xmm0, %ecx
-; SSE42-NEXT: pinsrb $1, %ecx, %xmm0
-; SSE42-NEXT: pinsrb $2, %eax, %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE42-NEXT: movl $255, %eax
 ; SSE42-NEXT: pinsrb $3, %eax, %xmm0
 ; SSE42-NEXT: movd %xmm0, (%rdi)
@@ -36,10 +33,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; AVX-LABEL: foo:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpextrb $8, %xmm0, %eax
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT: movl $255, %eax
 ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT: vmovd %xmm0, (%rdi)

llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Lines changed: 103 additions & 34 deletions
@@ -3028,40 +3028,109 @@ define void @PR43024() {
 }
 
 define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
-; SSE-LABEL: PR45604:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rsi), %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movl $11, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm0
-; SSE-NEXT: pextrw $1, %xmm1, %ecx
-; SSE-NEXT: pinsrw $4, %ecx, %xmm0
-; SSE-NEXT: pinsrw $6, %eax, %xmm0
-; SSE-NEXT: pextrw $2, %xmm1, %ecx
-; SSE-NEXT: movd %ecx, %xmm2
-; SSE-NEXT: pinsrw $2, %eax, %xmm2
-; SSE-NEXT: pextrw $3, %xmm1, %ecx
-; SSE-NEXT: pinsrw $4, %ecx, %xmm2
-; SSE-NEXT: pinsrw $6, %eax, %xmm2
-; SSE-NEXT: pextrw $4, %xmm1, %ecx
-; SSE-NEXT: movd %ecx, %xmm3
-; SSE-NEXT: pinsrw $2, %eax, %xmm3
-; SSE-NEXT: pextrw $5, %xmm1, %ecx
-; SSE-NEXT: pinsrw $4, %ecx, %xmm3
-; SSE-NEXT: pinsrw $6, %eax, %xmm3
-; SSE-NEXT: pextrw $6, %xmm1, %ecx
-; SSE-NEXT: movd %ecx, %xmm4
-; SSE-NEXT: pinsrw $2, %eax, %xmm4
-; SSE-NEXT: pextrw $7, %xmm1, %ecx
-; SSE-NEXT: pinsrw $4, %ecx, %xmm4
-; SSE-NEXT: pinsrw $6, %eax, %xmm4
-; SSE-NEXT: movdqa %xmm4, 48(%rdi)
-; SSE-NEXT: movdqa %xmm3, 32(%rdi)
-; SSE-NEXT: movdqa %xmm2, 16(%rdi)
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: PR45604:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movl $11, %eax
+; SSE2-NEXT: pinsrw $2, %eax, %xmm0
+; SSE2-NEXT: pextrw $1, %xmm1, %ecx
+; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pinsrw $2, %eax, %xmm2
+; SSE2-NEXT: pextrw $3, %xmm1, %ecx
+; SSE2-NEXT: pinsrw $4, %ecx, %xmm2
+; SSE2-NEXT: pinsrw $6, %eax, %xmm2
+; SSE2-NEXT: pextrw $4, %xmm1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: pinsrw $2, %eax, %xmm3
+; SSE2-NEXT: pextrw $5, %xmm1, %ecx
+; SSE2-NEXT: pinsrw $4, %ecx, %xmm3
+; SSE2-NEXT: pinsrw $6, %eax, %xmm3
+; SSE2-NEXT: pextrw $6, %xmm1, %ecx
+; SSE2-NEXT: movd %ecx, %xmm4
+; SSE2-NEXT: pinsrw $2, %eax, %xmm4
+; SSE2-NEXT: pextrw $7, %xmm1, %ecx
+; SSE2-NEXT: pinsrw $4, %ecx, %xmm4
+; SSE2-NEXT: pinsrw $6, %eax, %xmm4
+; SSE2-NEXT: movdqa %xmm4, 48(%rdi)
+; SSE2-NEXT: movdqa %xmm3, 32(%rdi)
+; SSE2-NEXT: movdqa %xmm2, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: PR45604:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa (%rsi), %xmm1
+; SSSE3-NEXT: movd %xmm1, %eax
+; SSSE3-NEXT: movzwl %ax, %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movl $11, %eax
+; SSSE3-NEXT: pinsrw $2, %eax, %xmm0
+; SSSE3-NEXT: pextrw $1, %xmm1, %ecx
+; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: pextrw $2, %xmm1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: pinsrw $2, %eax, %xmm2
+; SSSE3-NEXT: pextrw $3, %xmm1, %ecx
+; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm2
+; SSSE3-NEXT: pextrw $4, %xmm1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: pinsrw $2, %eax, %xmm3
+; SSSE3-NEXT: pextrw $5, %xmm1, %ecx
+; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm3
+; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm4
+; SSSE3-NEXT: pinsrw $2, %eax, %xmm4
+; SSSE3-NEXT: pextrw $7, %xmm1, %ecx
+; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm4
+; SSSE3-NEXT: movdqa %xmm4, 48(%rdi)
+; SSSE3-NEXT: movdqa %xmm3, 32(%rdi)
+; SSSE3-NEXT: movdqa %xmm2, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: PR45604:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rsi), %xmm1
+; SSE41-NEXT: pextrw $2, %xmm1, %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movl $11, %eax
+; SSE41-NEXT: pinsrw $2, %eax, %xmm0
+; SSE41-NEXT: pextrw $3, %xmm1, %ecx
+; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
+; SSE41-NEXT: pinsrw $6, %eax, %xmm0
+; SSE41-NEXT: pextrw $4, %xmm1, %ecx
+; SSE41-NEXT: movd %ecx, %xmm2
+; SSE41-NEXT: pinsrw $2, %eax, %xmm2
+; SSE41-NEXT: pextrw $5, %xmm1, %ecx
+; SSE41-NEXT: pinsrw $4, %ecx, %xmm2
+; SSE41-NEXT: pinsrw $6, %eax, %xmm2
+; SSE41-NEXT: pextrw $6, %xmm1, %ecx
+; SSE41-NEXT: movd %ecx, %xmm3
+; SSE41-NEXT: pinsrw $2, %eax, %xmm3
+; SSE41-NEXT: pextrw $7, %xmm1, %ecx
+; SSE41-NEXT: pinsrw $4, %ecx, %xmm3
+; SSE41-NEXT: pinsrw $6, %eax, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
+; SSE41-NEXT: pinsrw $2, %eax, %xmm4
+; SSE41-NEXT: pextrw $1, %xmm1, %ecx
+; SSE41-NEXT: pinsrw $4, %ecx, %xmm4
+; SSE41-NEXT: pinsrw $6, %eax, %xmm4
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: PR45604:
 ; AVX1: # %bb.0:
