Skip to content

Commit

Permalink
Accelerate Vector128<long>::op_Multiply on x64 (#103555)
Browse files Browse the repository at this point in the history
Co-authored-by: Tanner Gooding <tagoo@outlook.com>
  • Loading branch information
EgorBo and tannergooding authored Jun 28, 2024
1 parent 9501cce commit 33ca32d
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 17 deletions.
62 changes: 53 additions & 9 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21492,19 +21492,63 @@ GenTree* Compiler::gtNewSimdBinOpNode(
{
assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));

if (simdSize == 64)
bool isV512Supported = false;
if (compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512DQ_VL))
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512DQ));
intrinsic = NI_AVX512DQ_MultiplyLow;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
{
intrinsic = NI_AVX10v1_MultiplyLow;
if (simdSize == 64)
{
assert(isV512Supported);
intrinsic = NI_AVX512DQ_MultiplyLow;
}
else
{
intrinsic = !isV512Supported ? NI_AVX10v1_MultiplyLow : NI_AVX512DQ_VL_MultiplyLow;
}
}
else
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512DQ_VL));
intrinsic = NI_AVX512DQ_VL_MultiplyLow;
assert(((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE41)) ||
((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2)));

// Make op1 and op2 multi-use:
GenTree* op1Dup = fgMakeMultiUse(&op1);
GenTree* op2Dup = fgMakeMultiUse(&op2);

const bool is256 = simdSize == 32;

// Vector256<ulong> tmp0 = Avx2.Multiply(left, right);
GenTreeHWIntrinsic* tmp0 =
gtNewSimdHWIntrinsicNode(type, op1, op2, is256 ? NI_AVX2_Multiply : NI_SSE2_Multiply,
CORINFO_TYPE_ULONG, simdSize);

// Vector256<uint> tmp1 = Avx2.Shuffle(right.AsUInt32(), ZWXY);
GenTree* shuffleMask = gtNewIconNode(SHUFFLE_ZWXY, TYP_INT);
GenTreeHWIntrinsic* tmp1 = gtNewSimdHWIntrinsicNode(type, op2Dup, shuffleMask,
is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
CORINFO_TYPE_UINT, simdSize);

// Vector256<uint> tmp2 = Avx2.MultiplyLow(left.AsUInt32(), tmp1);
GenTreeHWIntrinsic* tmp2 =
gtNewSimdHWIntrinsicNode(type, op1Dup, tmp1,
is256 ? NI_AVX2_MultiplyLow : NI_SSE41_MultiplyLow,
CORINFO_TYPE_UINT, simdSize);

// Vector256<int> tmp3 = Avx2.HorizontalAdd(tmp2.AsInt32(), Vector256<int>.Zero);
GenTreeHWIntrinsic* tmp3 =
gtNewSimdHWIntrinsicNode(type, tmp2, gtNewZeroConNode(type),
is256 ? NI_AVX2_HorizontalAdd : NI_SSSE3_HorizontalAdd,
CORINFO_TYPE_UINT, simdSize);

// Vector256<int> tmp4 = Avx2.Shuffle(tmp3, YWXW);
shuffleMask = gtNewIconNode(SHUFFLE_YWXW, TYP_INT);
GenTreeHWIntrinsic* tmp4 =
gtNewSimdHWIntrinsicNode(type, tmp3, shuffleMask, is256 ? NI_AVX2_Shuffle : NI_SSE2_Shuffle,
CORINFO_TYPE_UINT, simdSize);

// result = tmp0 + tmp4;
op1 = tmp0;
op2 = tmp4;
intrinsic = simdSize == 32 ? NI_AVX2_Add : NI_SSE2_Add;
}

break;
Expand Down
22 changes: 15 additions & 7 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2750,17 +2750,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

if (varTypeIsLong(simdBaseType))
{
if (simdSize != 64 && !canUseEvexEncoding())
if (TARGET_POINTER_SIZE == 4)
{
// TODO-XARCH-CQ: We should support long/ulong multiplication
// TODO-XARCH-CQ: 32bit support
break;
}
// else if simdSize == 64 then above assert would check if baseline isa supported

#if defined(TARGET_X86)
// TODO-XARCH-CQ: We need to support 64-bit CreateBroadcast
break;
#endif // TARGET_X86
if ((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
// Emulate NI_AVX512DQ_VL_MultiplyLow with AVX2 for SIMD32
}
else if ((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
// Emulate NI_AVX512DQ_VL_MultiplyLow with SSE41 for SIMD16
}
else
{
// Software fallback
break;
}
}

CORINFO_ARG_LIST_HANDLE arg1 = sig->args;
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,7 @@ void BroadcastConstantToSimd(TSimd* result, TBase arg0)
// Shuffle control constants. Each value packs four 2-bit selectors into a single byte;
// the trailing comment on every line spells those selectors out in binary, with the
// most-significant bit pair first. The four letters in a macro name correspond, in the
// same order, to those selectors using the mapping X = 00, Y = 01, Z = 10, W = 11
// (e.g. SHUFFLE_YWXW is 01 11 00 11, i.e. 0x73).
#define SHUFFLE_XYZW 0x1B // 00 01 10 11
#define SHUFFLE_YXYX 0x44 // 01 00 01 00
#define SHUFFLE_YWXZ 0x72 // 01 11 00 10
#define SHUFFLE_YWXW 0x73 // 01 11 00 11
#define SHUFFLE_YYZZ 0x5A // 01 01 10 10
#define SHUFFLE_ZXXX 0x80 // 10 00 00 00
#define SHUFFLE_ZXXY 0x81 // 10 00 00 01
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
Expand Down

0 comments on commit 33ca32d

Please sign in to comment.