diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e779a3d3dab6c..19510bbba0317 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2405,6 +2405,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
   }
 
+  // VSCALEF computes X * 2^floor(Y), which implements ldexp once the integer
+  // exponent has been converted to floating point.
+  if (Subtarget.hasAVX512()) {
+    // Scalars use the scalar form; 512-bit vectors need only AVX512F.
+    for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64})
+      setOperationAction(ISD::FLDEXP, VT, Custom);
+
+    // The packed 128/256-bit encodings additionally require AVX512VL.
+    if (Subtarget.hasVLX())
+      for (MVT VT : {MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64})
+        setOperationAction(ISD::FLDEXP, VT, Custom);
+  }
+
   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
   // is. We should promote the value to 64-bits to solve this.
   // This is what the CRT headers do - `fmodf` is an inline header
@@ -31814,6 +31827,70 @@ static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
   return StringRef();
 }
 
+/// Custom-lower ISD::FLDEXP (ldexp: X * 2^Exp) using the AVX-512 VSCALEF
+/// family, which computes X * 2^floor(Y) for a floating-point Y.
+///
+/// The integer exponent is first converted to the floating-point type of X.
+/// Scalars are placed in element 0 of a 128-bit vector and use the scalar
+/// node X86ISD::SCALEFS; vectors use the packed node X86ISD::SCALEF. f16 is
+/// computed in f32 and rounded back afterwards.
+///
+/// \returns the lowered value, or an empty SDValue when the type cannot be
+/// handled here so that the legalizer falls back to its default expansion.
+static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Exp = Op.getOperand(1);
+  EVT XTy = X.getValueType();
+
+  MVT ScalarVecVT;
+  switch (Op.getSimpleValueType().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::f16:
+    // TODO: Choose vscalefph when fp16 for ISD::FLDEXP is fully supported.
+    X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
+    [[fallthrough]];
+  case MVT::f32:
+    ScalarVecVT = MVT::v4f32;
+    break;
+  case MVT::f64:
+    ScalarVecVT = MVT::v2f64;
+    break;
+  case MVT::v4f32:
+  case MVT::v2f64:
+  case MVT::v8f32:
+  case MVT::v4f64:
+    // The packed 128/256-bit forms need AVX512VL. X86ISD::SCALEFS must not be
+    // used as a substitute: it only scales element 0 and passes the remaining
+    // elements of X through unchanged.
+    if (!Subtarget.hasVLX())
+      return SDValue();
+    [[fallthrough]];
+  case MVT::v16f32:
+  case MVT::v8f64:
+    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+    return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+  }
+
+  // Scalar path: X may have been extended from f16 to f32 above, so use its
+  // current type (not XTy) for the computation.
+  EVT EltVT = X.getValueType();
+  Exp = DAG.getNode(ISD::SINT_TO_FP, DL, EltVT, Exp);
+  SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarVecVT, X);
+  SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarVecVT, Exp);
+  SDValue Scalef = DAG.getNode(X86ISD::SCALEFS, DL, ScalarVecVT, VX, VExp);
+  SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Scalef,
+                               DAG.getVectorIdxConstant(0, DL));
+  // Round an f16 computation back. The flag must be 0: the f32 result is not
+  // guaranteed to be exactly representable in f16.
+  if (EltVT != XTy)
+    Result = DAG.getNode(ISD::FP_ROUND, DL, XTy, Result,
+                         DAG.getIntPtrConstant(0, DL));
+  return Result;
+}
+
 bool X86TargetLowering::isInlineAsmTargetBranch(
     const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
   // In a __asm block, __asm inst foo where inst is CALL or JMP should be
@@ -31979,6 +32056,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
   case ISD::PREFETCH:           return LowerPREFETCH(Op, Subtarget, DAG);
+  case ISD::FLDEXP:             return LowerFLDEXP(Op, Subtarget, DAG);
   }
 }
 
diff --git a/llvm/test/CodeGen/X86/call-ldexp.ll b/llvm/test/CodeGen/X86/call-ldexp.ll
new file mode 100644
index 0000000000000..5137e7a64a446
--- /dev/null
+++ b/llvm/test/CodeGen/X86/call-ldexp.ll
@@ -0,0 +1,361 @@
+; NOTE: Assertions have been autogenerated by
utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s -o - | FileCheck %s --check-prefixes=AVX512 +; RUN: llc -mtriple=x86_64 -mattr=+avx512vl < %s -o - | FileCheck %s --check-prefixes=AVX512VL + +define half @test_half(half %x, i32 %exp) { +; AVX512-LABEL: test_half: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrw $0, %xmm0, %eax +; AVX512-NEXT: vcvtsi2ss %edi, %xmm1, %xmm0 +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vscalefss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_half: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpextrw $0, %xmm0, %eax +; AVX512VL-NEXT: vcvtsi2ss %edi, %xmm1, %xmm0 +; AVX512VL-NEXT: movzwl %ax, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512VL-NEXT: vscalefss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +entry: + %r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp) + ret half %r +} + +declare half @llvm.ldexp.f16.i32(half, i32) memory(none) + +define float @test_float(float %x, i32 %exp) { +; AVX512-LABEL: test_float: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1 +; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_float: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1 +; AVX512VL-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +entry: + %r = tail call fast float @ldexpf(float %x, i32 %exp) + ret float %r +} + +declare float @ldexpf(float, i32) memory(none) + +define double @test_double(double %x, i32 %exp) { +; AVX512-LABEL: test_double: +; AVX512: # %bb.0: # %entry +; 
AVX512-NEXT: vcvtsi2sd %edi, %xmm1, %xmm1 +; AVX512-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_double: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vcvtsi2sd %edi, %xmm1, %xmm1 +; AVX512VL-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +entry: + %r = tail call fast double @ldexp(double %x, i32 %exp) + ret double %r +} + +declare double @ldexp(double, i32) memory(none) + +define fp128 @testExpl(fp128 %x, i32 %exp) { +; AVX512-LABEL: testExpl: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: jmp ldexpl@PLT # TAILCALL +; +; AVX512VL-LABEL: testExpl: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: jmp ldexpl@PLT # TAILCALL +entry: + %r = tail call fast fp128 @ldexpl(fp128 %x, i32 %exp) + ret fp128 %r +} + +declare fp128 @ldexpl(fp128, i32) memory(none) + +define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) { +; AVX512-LABEL: test_ldexp_4xfloat: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_4xfloat: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq + %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp) + ret <4 x float> %r +} +declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) + +define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) { +; AVX512-LABEL: test_ldexp_2xdouble: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm2 +; AVX512-NEXT: vscalefsd %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_2xdouble: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2pd %xmm1, %xmm2 +; 
AVX512VL-NEXT: vscalefsd %xmm2, %xmm0, %xmm2 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512VL-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512VL-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: retq + %r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp) + ret <2 x double> %r +} +declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) + +define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) { +; AVX512-LABEL: test_ldexp_8xfloat: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm2, %xmm4 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm2[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm3 +; AVX512-NEXT: vscalefss %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[0] +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm3 +; AVX512-NEXT: vscalefss %xmm3, %xmm0, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm5, %xmm5 +; AVX512-NEXT: vscalefss %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] +; 
AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm5, %xmm5 +; AVX512-NEXT: vscalefss %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xfloat: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512VL-NEXT: vscalefps %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq + %r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp) + ret <8 x float> %r +} +declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) + +define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) { +; AVX512-LABEL: test_ldexp_4xdouble: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512-NEXT: vcvtdq2pd %xmm3, %xmm3 +; AVX512-NEXT: vscalefsd %xmm3, %xmm2, %xmm3 +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm4 +; AVX512-NEXT: vscalefsd %xmm4, %xmm0, %xmm4 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtdq2pd %xmm4, %xmm4 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vscalefsd %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_4xdouble: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512VL-NEXT: vscalefpd %ymm1, 
%ymm0, %ymm0 +; AVX512VL-NEXT: retq + %r = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp) + ret <4 x double> %r +} +declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) + +define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) { +; AVX512-LABEL: test_ldexp_16xfloat: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm3 +; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm2, %xmm4 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm2[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm3 +; AVX512-NEXT: vscalefss %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm4 +; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm5 +; AVX512-NEXT: vscalefss %xmm5, %xmm3, %xmm5 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm3[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm4[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm7, %xmm7 +; AVX512-NEXT: vscalefss %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm6 = xmm3[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm7, %xmm7 +; AVX512-NEXT: vscalefss %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; 
AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[0] +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm5 +; AVX512-NEXT: vscalefss %xmm5, %xmm3, %xmm5 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm3[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm4[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm7, %xmm7 +; AVX512-NEXT: vscalefss %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm6 = xmm3[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm7, %xmm7 +; AVX512-NEXT: vscalefss %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[0] +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm0, %xmm4 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 
+; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_16xfloat: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: retq + %r = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %x, <16 x i32> %exp) + ret <16 x float> %r +} +declare <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>) + +define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) { +; AVX512-LABEL: test_ldexp_8xdouble: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX512-NEXT: vcvtdq2pd %xmm4, %xmm4 +; AVX512-NEXT: vscalefsd %xmm4, %xmm2, %xmm4 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512-NEXT: vcvtdq2pd %xmm3, %xmm6 +; AVX512-NEXT: vscalefsd %xmm6, %xmm5, %xmm6 +; AVX512-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3] +; AVX512-NEXT: vcvtdq2pd %xmm7, %xmm7 +; AVX512-NEXT: vscalefsd %xmm7, %xmm6, %xmm7 +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm8 +; AVX512-NEXT: vscalefsd %xmm8, %xmm0, %xmm8 +; AVX512-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm3[3,3,3,3] +; AVX512-NEXT: vcvtdq2pd %xmm7, %xmm7 +; AVX512-NEXT: vscalefsd %xmm7, %xmm2, %xmm2 +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX512-NEXT: vcvtdq2pd %xmm3, %xmm3 +; AVX512-NEXT: vscalefsd %xmm3, %xmm5, %xmm3 +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = 
xmm1[3,3,3,3] +; AVX512-NEXT: vcvtdq2pd %xmm3, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512-NEXT: vscalefsd %xmm3, %xmm5, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6] +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xdouble: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; AVX512VL-NEXT: vscalefpd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: retq + %r = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %x, <8 x i32> %exp) + ret <8 x double> %r +} +declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 063182fcecf3e..1cd25d33031be 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -79,39 +79,59 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: subq $40, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vmovd %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq 
ldexpf@PLT -; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-AVX-NEXT: addq $40, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX-NEXT: retq - %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> , <4 x i32> %i) +; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: subq $40, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48 +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vmovd %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-AVX2-NEXT: addq $40, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX2-NEXT: retq + +; CHECK-AVX512-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK: # %bb.0: +; CHECK-AVX512-NEXT: vcvtdq2ps %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vmovss .LCPI1_0(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero +; CHECK-AVX512-NEXT: vscalefss %xmm1, %xmm2, %xmm1 +; CHECK-AVX512-NEXT: vshufps $85, %xmm0, %xmm0, %xmm3 # xmm3 = xmm0[1,1,1,1] +; CHECK-AVX512-NEXT: vcvtdq2ps %xmm3, %xmm3 +; CHECK-AVX512-NEXT: vscalefss %xmm3, %xmm2, %xmm3 +; CHECK-AVX512-NEXT: vunpcklps %xmm3, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-AVX512-NEXT: vshufps $238, %xmm0, %xmm0, %xmm3 # xmm3 = xmm0[2,3,2,3] +; CHECK-AVX512-NEXT: vcvtdq2ps %xmm3, %xmm3 +; CHECK-AVX512-NEXT: vscalefss %xmm3, %xmm2, %xmm3 +; CHECK-AVX512-NEXT: vmovlhps %xmm3, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm3[0] +; CHECK-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 # xmm0 = xmm0[3,3,3,3] +; CHECK-AVX512-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vscalefss %xmm0, %xmm2, %xmm0 +; CHECK-AVX512-NEXT: vinsertps $48, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-AVX512-NEXT: retq + +%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> , <4 x i32> %i) ret <4 x float> %r } @@ 
-543,88 +563,70 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; ; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: ; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: subq $72, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph 
$4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm1, %xmm2 +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-AVX512F-NEXT: vscalefss %xmm2, %xmm1, %xmm2 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; CHECK-AVX512F-NEXT: vmovd %xmm2, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3 +; CHECK-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-AVX512F-NEXT: vmovd %xmm3, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 +; CHECK-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm4, %xmm3 +; CHECK-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-AVX512F-NEXT: vmovd %xmm3, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 +; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4 +; CHECK-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-AVX512F-NEXT: vmovd %xmm4, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 +; CHECK-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm5, %xmm3 +; CHECK-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 +; CHECK-AVX512F-NEXT: 
vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-AVX512F-NEXT: vmovd %xmm3, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 +; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4 +; CHECK-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-AVX512F-NEXT: vmovd %xmm4, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 +; CHECK-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4 +; CHECK-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-AVX512F-NEXT: vmovd %xmm4, %eax +; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: movzwl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-AVX512F-NEXT: callq ldexpf@PLT +; CHECK-AVX512F-NEXT: cwtl +; CHECK-AVX512F-NEXT: vcvtsi2ss %eax, %xmm5, %xmm0 +; CHECK-AVX512F-NEXT: vscalefss %xmm0, %xmm1, %xmm0 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax ; CHECK-AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX512F-NEXT: addq $72, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; CHECK-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-AVX512F-NEXT: retq %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> , <8 x i16> %i) ret <8 x half> %r