diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index cc742ab35f449..32ce34114b2f5 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -66,9 +66,15 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::umul_fix: case Intrinsic::umul_fix_sat: case Intrinsic::sqrt: // Begin floating-point. + case Intrinsic::asin: + case Intrinsic::acos: + case Intrinsic::atan: case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::tan: + case Intrinsic::sinh: + case Intrinsic::cosh: + case Intrinsic::tanh: case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::log: diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 04289d43f40e2..c051e2f18380b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -414,6 +414,7 @@ for.end: define void @acos_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @acos_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_acosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_acosf(<16 x float> [[TMP4:%.*]]) @@ -487,7 +488,10 @@ for.end: define void @asin_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f64_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @llvm.asin.v2f64(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.asin.v4f64(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.asin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -510,6 +514,7 @@ for.end: define void @asin_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_asinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]]) @@ -588,6 +593,7 @@ define void @atan_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.atan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -610,6 +616,7 @@ for.end: define void @atan_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @atan_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_atanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]]) @@ -683,6 +690,9 @@ for.end: define void @cosh_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @cosh_f64_intrinsic( ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cosh(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.cosh.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.cosh.v8f64(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cosh.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -705,8 +715,10 @@ for.end: define void @cosh_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @cosh_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_coshf(<8 x float> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @llvm.cosh.v16f32(<16 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -754,6 +766,7 @@ for.end: define void @tanh_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @tanh_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanhf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanhf(<16 x float> [[TMP4:%.*]]) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll index 809059034c7f9..da827a5b674d8 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -627,13 +627,11 @@ define <4 x float> @int_asin_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -708,13 +706,11 @@ define <4 x float> @int_acos_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -789,13 +785,11 @@ define <4 x float> @int_atan_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -870,13 +864,11 @@ define <4 x float> @int_sinh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -951,13 +943,11 @@ define <4 x float> @int_cosh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -1032,13 +1022,11 @@ define <4 x float> @int_tanh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll index 36633a1053b14..62b8c0ce3291a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -627,13 +627,11 @@ define <4 x float> @int_asin_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -708,13 +706,11 @@ define <4 x float> @int_acos_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -789,13 +785,11 @@ define <4 x float> @int_atan_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -870,13 +864,11 @@ define <4 x float> @int_sinh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -951,13 +943,11 @@ define <4 x float> @int_cosh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -1032,13 +1022,11 @@ define <4 x float> @int_tanh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16