diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 60d0cc3f98730..fdf973d0cf1b7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -6134,8 +6134,13 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, unsigned Unused; if (LT.second.isFixedLengthVector() && LT.second.getVectorNumElements() == Mask.size() && - (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && + (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc || + // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask + // mean that we can end up with shuffles that satisfy isTRNMask, but end + // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}). + Kind == TTI::SK_InsertSubvector) && (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) || + isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) || isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) || isREVMask(Mask, LT.second.getScalarSizeInBits(), LT.second.getVectorNumElements(), 16) || diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll index 4c4843088551a..402313d00de36 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll @@ -12,6 +12,15 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ret <8 x i8> %tmp0 } +define <8 x i8> @trn1.v8i8_flipped(<8 x i8> %v0, <8 x i8> %v1) { +; CHECK-LABEL: 'trn1.v8i8_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0 +; + %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> + ret <8 x i8> %tmp0 +} + define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: 'trn2.v8i8' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> @@ -21,6 +30,15 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ret <8 x i8> %tmp0 } +define <8 x i8> @trn2.v8i8_flipped(<8 x i8> %v0, <8 x i8> %v1) { +; CHECK-LABEL: 'trn2.v8i8_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0 +; + %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> + ret <8 x i8> %tmp0 +} + define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'trn1.v16i8' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> @@ -30,6 +48,15 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ret <16 x i8> %tmp0 } +define <16 x i8> @trn1.v16i8_flipped(<16 x i8> %v0, <16 x i8> %v1) { +; CHECK-LABEL: 'trn1.v16i8_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0 +; + %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> + ret <16 x i8> %tmp0 +} + define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'trn2.v16i8' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> @@ -39,6 +66,15 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ret <16 x i8> %tmp0 } +define <16 x i8> @trn2.v16i8_flipped(<16 x i8> %v0, <16 x i8> %v1) { +; CHECK-LABEL: 'trn2.v16i8_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0 +; + %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> + ret <16 x i8> %tmp0 +} + define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: 'trn1.v4i16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> @@ -48,6 +84,15 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ret <4 x i16> %tmp0 } +define <4 x i16> @trn1.v4i16_flipped(<4 x i16> %v0, <4 x i16> %v1) { +; CHECK-LABEL: 'trn1.v4i16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %tmp0 +; + %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> + ret <4 x i16> %tmp0 +} + define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: 'trn2.v4i16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> @@ -57,6 +102,15 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ret <4 x i16> %tmp0 } +define <4 x i16> @trn2.v4i16_flipped(<4 x i16> %v0, <4 x i16> %v1) { +; CHECK-LABEL: 'trn2.v4i16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %tmp0 +; + %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> + ret <4 x i16> %tmp0 +} + define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'trn1.v8i16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> @@ -66,6 +120,15 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ret <8 x i16> %tmp0 } +define <8 x i16> @trn1.v8i16_flipped(<8 x i16> %v0, <8 x i16> %v1) { +; CHECK-LABEL: 'trn1.v8i16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0 +; + %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> + ret <8 x i16> %tmp0 +} + define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'trn2.v8i16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> @@ -75,6 +138,15 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ret <8 x i16> %tmp0 } +define <8 x i16> @trn2.v8i16_flipped(<8 x i16> %v0, <8 x i16> %v1) { +; CHECK-LABEL: 'trn2.v8i16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0 +; + %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> + ret <8 x i16> %tmp0 +} + define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: 'trn1.v2i32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> @@ -84,6 +156,15 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ret <2 x i32> %tmp0 } +define <2 x i32> @trn1.v2i32_flipped(<2 x i32> %v0, <2 x i32> %v1) { +; CHECK-LABEL: 'trn1.v2i32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %tmp0 +; + %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> + ret <2 x i32> %tmp0 +} + define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: 'trn2.v2i32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> @@ -93,6 +174,15 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ret <2 x i32> %tmp0 } +define <2 x i32> @trn2.v2i32_flipped(<2 x i32> %v0, <2 x i32> %v1) { +; CHECK-LABEL: 'trn2.v2i32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %tmp0 +; + %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> + ret <2 x i32> %tmp0 +} + define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'trn1.v4i32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> @@ -102,6 +192,15 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ret <4 x i32> %tmp0 } +define <4 x i32> @trn1.v4i32_flipped(<4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: 'trn1.v4i32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %tmp0 +; + %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> + ret <4 x i32> %tmp0 +} + define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'trn2.v4i32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> @@ -111,6 +210,15 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ret <4 x i32> %tmp0 } +define <4 x i32> @trn2.v4i32_flipped(<4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: 'trn2.v4i32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %tmp0 +; + %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> + ret <4 x i32> %tmp0 +} + define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: 'trn1.v2i64' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> @@ -120,6 +228,15 @@ define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ret <2 x i64> %tmp0 } +define <2 x i64> @trn1.v2i64_flipped(<2 x i64> %v0, <2 x i64> %v1) { +; CHECK-LABEL: 'trn1.v2i64_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %tmp0 +; + %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> + ret <2 x i64> %tmp0 +} + define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: 'trn2.v2i64' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> @@ -129,6 +246,15 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ret <2 x i64> %tmp0 } +define <2 x i64> @trn2.v2i64_flipped(<2 x i64> %v0, <2 x i64> %v1) { +; CHECK-LABEL: 'trn2.v2i64_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %tmp0 +; + %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> + ret <2 x i64> %tmp0 +} + define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK-LABEL: 'trn1.v2f32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> @@ -138,6 +264,15 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { ret <2 x float> %tmp0 } +define <2 x float> @trn1.v2f32_flipped(<2 x float> %v0, <2 x float> %v1) { +; CHECK-LABEL: 'trn1.v2f32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %tmp0 +; + %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> + ret <2 x float> %tmp0 +} + define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK-LABEL: 'trn2.v2f32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> @@ -147,6 +282,15 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { ret <2 x float> %tmp0 } +define <2 x float> @trn2.v2f32_flipped(<2 x float> %v0, <2 x float> %v1) { +; CHECK-LABEL: 'trn2.v2f32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %tmp0 +; + %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> + ret <2 x float> %tmp0 +} + define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: 'trn1.v4f32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> @@ -156,6 +300,15 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ret <4 x float> %tmp0 } +define <4 x float> @trn1.v4f32_flipped(<4 x float> %v0, <4 x float> %v1) { +; CHECK-LABEL: 'trn1.v4f32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %tmp0 +; + %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> + ret <4 x float> %tmp0 +} + define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: 'trn2.v4f32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> @@ -165,6 +318,15 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ret <4 x float> %tmp0 } +define <4 x float> @trn2.v4f32_flipped(<4 x float> %v0, <4 x float> %v1) { +; CHECK-LABEL: 'trn2.v4f32_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %tmp0 +; + %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> + ret <4 x float> %tmp0 +} + define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) { ; CHECK-LABEL: 'trn1.v2f64' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> @@ -174,6 +336,15 @@ define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) { ret <2 x double> %tmp0 } +define <2 x double> @trn1.v2f64_flipped(<2 x double> %v0, <2 x double> %v1) { +; CHECK-LABEL: 'trn1.v2f64_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %tmp0 +; + %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> + ret <2 x double> %tmp0 +} + define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { ; CHECK-LABEL: 'trn2.v2f64' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> @@ -183,6 +354,15 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { ret <2 x double> %tmp0 } +define <2 x double> @trn2.v2f64_flipped(<2 x double> %v0, <2 x double> %v1) { +; CHECK-LABEL: 'trn2.v2f64_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %tmp0 +; + %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> + ret <2 x double> %tmp0 +} + define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: 'trn1.v4f16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> @@ -192,6 +372,15 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ret <4 x half> %tmp0 } +define <4 x half> @trn1.v4f16_flipped(<4 x half> %v0, <4 x half> %v1) { +; CHECK-LABEL: 'trn1.v4f16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %tmp0 +; + %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> + ret <4 x half> %tmp0 +} + define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: 'trn2.v4f16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> @@ -201,6 +390,15 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ret <4 x half> %tmp0 } +define <4 x half> @trn2.v4f16_flipped(<4 x half> %v0, <4 x half> %v1) { +; CHECK-LABEL: 'trn2.v4f16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %tmp0 +; + %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> + ret <4 x half> %tmp0 +} + define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: 'trn1.v8f16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> @@ -210,6 +408,15 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ret <8 x half> %tmp0 } +define <8 x half> @trn1.v8f16_flipped(<8 x half> %v0, <8 x half> %v1) { +; CHECK-LABEL: 'trn1.v8f16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %tmp0 +; + %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> + ret <8 x half> %tmp0 +} + define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: 'trn2.v8f16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> @@ -219,6 +426,15 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ret <8 x half> %tmp0 } +define <8 x half> @trn2.v8f16_flipped(<8 x half> %v0, <8 x half> %v1) { +; CHECK-LABEL: 'trn2.v8f16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %tmp0 +; + %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> + ret <8 x half> %tmp0 +} + define <4 x bfloat> @trn1.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) { ; CHECK-LABEL: 'trn1.v4bf16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> @@ -228,6 +444,15 @@ define <4 x bfloat> @trn1.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) { ret <4 x bfloat> %tmp0 } +define <4 x bfloat> @trn1.v4bf16_flipped(<4 x bfloat> %v0, <4 x bfloat> %v1) { +; CHECK-LABEL: 'trn1.v4bf16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0 +; + %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> + ret <4 x bfloat> %tmp0 +} + define <4 x bfloat> @trn2.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) { ; CHECK-LABEL: 'trn2.v4bf16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> @@ -237,6 +462,15 @@ define <4 x bfloat> @trn2.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) { ret <4 x bfloat> %tmp0 } +define <4 x bfloat> @trn2.v4bf16_flipped(<4 x bfloat> %v0, <4 x bfloat> %v1) { +; CHECK-LABEL: 'trn2.v4bf16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0 +; + %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> + ret <4 x bfloat> %tmp0 +} + define <8 x bfloat> @trn1.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) { ; CHECK-LABEL: 'trn1.v8bf16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> @@ -246,6 +480,15 @@ define <8 x bfloat> @trn1.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) { ret <8 x bfloat> %tmp0 } +define <8 x bfloat> @trn1.v8bf16_flipped(<8 x bfloat> %v0, <8 x bfloat> %v1) { +; CHECK-LABEL: 'trn1.v8bf16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0 +; + %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> + ret <8 x bfloat> %tmp0 +} + define <8 x bfloat> @trn2.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) { ; CHECK-LABEL: 'trn2.v8bf16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> @@ -254,3 +497,12 @@ define <8 x bfloat> @trn2.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) { %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> ret <8 x bfloat> %tmp0 } + +define <8 x bfloat> @trn2.v8bf16_flipped(<8 x bfloat> %v0, <8 x bfloat> %v1) { +; CHECK-LABEL: 'trn2.v8bf16_flipped' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0 +; + %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> + ret <8 x bfloat> %tmp0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index 10a17f7e3f9a6..7f737cd169147 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -645,19 +645,18 @@ define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo) ; CHECK-NEXT: [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64> ; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8> ; CHECK-NEXT: [[A1:%.*]] = and <16 x i8> [[BC1]], -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[A1]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[A1]], i64 8 ; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]] ; CHECK-NEXT: [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64> ; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8> ; CHECK-NEXT: [[A2:%.*]] = and <16 x i8> [[BC2]], -; CHECK-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[A2]], i64 0 -; CHECK-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[A2]], i64 8 ; CHECK-NEXT: [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]] ; CHECK-NEXT: [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1) ; CHECK-NEXT: store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8 -; CHECK-NEXT: [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[O2:%.*]] = or i8 [[E4]], [[E3]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[O3:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0 +; CHECK-NEXT: [[O2:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1 ; CHECK-NEXT: [[O4:%.*]] = or i8 [[O3]], [[O2]] ; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[O4]], 0 ; CHECK-NEXT: ret i1 [[C]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll new file mode 100644 index 0000000000000..66461c9d91064 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define dso_local <16 x i8> @transpose_splat_constants(i8 noundef %x) { +; CHECK-LABEL: @transpose_splat_constants( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[X:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> , <16 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP3]] +; +entry: + %0 = insertelement <16 x i8> , i8 %x, i64 0 + %1 = insertelement <16 x i8> %0, i8 %x, i64 2 + %2 = insertelement <16 x i8> %1, i8 %x, i64 4 + %3 = insertelement <16 x i8> %2, i8 %x, i64 6 + %4 = insertelement <16 x i8> %3, i8 %x, i64 8 + %5 = insertelement <16 x i8> %4, i8 %x, i64 10 + %6 = insertelement <16 x i8> %5, i8 %x, i64 12 + %7 = insertelement <16 x i8> %6, i8 %x, i64 14 + ret <16 x i8> %7 +} + +define dso_local <16 x i8> @transpose_constants_splat(i8 noundef %x) { +; CHECK-LABEL: @transpose_constants_splat( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[X:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> , <16 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP3]] +; +entry: + %0 = insertelement <16 x i8> , i8 %x, i64 1 + %1 = insertelement <16 x i8> %0, i8 %x, i64 3 + %2 = insertelement <16 x i8> %1, i8 %x, i64 5 + %3 = insertelement <16 x i8> %2, i8 %x, i64 7 + %4 = insertelement <16 x i8> %3, i8 %x, i64 9 + %5 = insertelement <16 x i8> %4, i8 %x, i64 11 + %6 = insertelement <16 x i8> %5, i8 %x, i64 13 + %7 = insertelement <16 x i8> %6, i8 %x, i64 15 + ret <16 x i8> %7 +}