From ef0e46f8ad10be7bc5269265a3b1101bcee5f040 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 28 Jun 2024 21:52:34 +0200 Subject: [PATCH 1/6] Optimize Vector128 multiplication for arm64 --- src/coreclr/jit/gentree.cpp | 29 +++++++++++++++++++++++++++- src/coreclr/jit/hwintrinsicarm64.cpp | 4 ++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ce92e74c0175c2..3f66fc482acb8c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21865,7 +21865,7 @@ GenTree* Compiler::gtNewSimdBinOpNode( case GT_MUL: { - assert(!varTypeIsLong(simdBaseType)); + assert(!varTypeIsLong(simdBaseType) || (simdSize == 16)); GenTree** scalarOp = nullptr; if (varTypeIsArithmetic(op1)) @@ -21929,6 +21929,33 @@ GenTree* Compiler::gtNewSimdBinOpNode( break; } + case TYP_LONG: + case TYP_ULONG: + { + assert(simdSize == 16); + + // Make op1 and op2 multi-use: + GenTree* op1Dup = fgMakeMultiUse(&op1); + GenTree* op2Dup = fgMakeMultiUse(&op2); + + // long left0 = op1.GetElement(0) + // long left1 = op1.GetElement(1) + GenTree* left0 = gtNewSimdGetElementNode(TYP_LONG, op1, gtNewIconNode(0), simdBaseJitType, 16); + GenTree* left1 = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, 16); + + // long right0 = op2.GetElement(0) + // long right1 = op2.GetElement(1) + GenTree* right0 = gtNewSimdGetElementNode(TYP_LONG, op2, gtNewIconNode(0), simdBaseJitType, 16); + GenTree* right1 = gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), simdBaseJitType, 16); + + // Vector128 vec = Vector128.Create(left0 * right0, left1 * right1) + op1 = gtNewOperNode(GT_MUL, TYP_LONG, left0, right0); + op2 = gtNewOperNode(GT_MUL, TYP_LONG, left1, right1); + GenTree* vec = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16); + return gtNewSimdHWIntrinsicNode(TYP_SIMD16, vec, gtNewIconNode(1), op2, NI_AdvSimd_Insert, + simdBaseJitType, 16); + } + default: { unreached(); diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index e7619f3c2c85d0..5213357674f0d1 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -1769,9 +1769,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); - if (varTypeIsLong(simdBaseType)) + if (varTypeIsLong(simdBaseType) && (intrinsic == NI_Vector64_op_Multiply)) { - // TODO-ARM64-CQ: We should support long/ulong multiplication. + // long/ulong multiplication needs no special handling for SIMD8 break; } From 77977f5424aa2f7f732097883f91eaa219e0efe8 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 30 Jun 2024 22:49:12 +0200 Subject: [PATCH 2/6] add Vector64 --- src/coreclr/jit/gentree.cpp | 27 +++++++++++++++++---------- src/coreclr/jit/hwintrinsicarm64.cpp | 6 ------ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index c51de6626eaa0a..a7aba241e0df2e 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21976,27 +21976,34 @@ GenTree* Compiler::gtNewSimdBinOpNode( case TYP_LONG: case TYP_ULONG: { - assert(simdSize == 16); - // Make op1 and op2 multi-use: GenTree* op1Dup = fgMakeMultiUse(&op1); GenTree* op2Dup = fgMakeMultiUse(&op2); - // long left0 = op1.GetElement(0) - // long left1 = op1.GetElement(1) - GenTree* left0 = gtNewSimdGetElementNode(TYP_LONG, op1, gtNewIconNode(0), simdBaseJitType, 16); - GenTree* left1 = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, 16); - + // long left0 = op1.GetElement(0) // long right0 = op2.GetElement(0) + GenTree* left0 = + gtNewSimdGetElementNode(TYP_LONG, op1, gtNewIconNode(0), simdBaseJitType, simdSize); + GenTree* right0 = + gtNewSimdGetElementNode(TYP_LONG, op2, gtNewIconNode(0), simdBaseJitType, simdSize); + + if (simdSize == 8) + { + return gtNewSimdCreateScalarUnsafeNode(TYP_SIMD8, + gtNewOperNode(GT_MUL, TYP_LONG, left0, right0), + simdBaseJitType, 8); + } + // long right1 = op2.GetElement(1) - GenTree* right0 = gtNewSimdGetElementNode(TYP_LONG, op2, gtNewIconNode(0), simdBaseJitType, 16); + // long left1 = op1.GetElement(1) + GenTree* left1 = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, 16); GenTree* right1 = gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), simdBaseJitType, 16); // Vector128 vec = Vector128.Create(left0 * right0, left1 * right1) op1 = gtNewOperNode(GT_MUL, TYP_LONG, left0, right0); op2 = gtNewOperNode(GT_MUL, TYP_LONG, left1, right1); - GenTree* vec = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16); - return gtNewSimdHWIntrinsicNode(TYP_SIMD16, vec, gtNewIconNode(1), op2, NI_AdvSimd_Insert, + GenTree* vec = gtNewSimdCreateScalarUnsafeNode(type, op1, simdBaseJitType, 16); + return gtNewSimdHWIntrinsicNode(type, vec, gtNewIconNode(1), op2, NI_AdvSimd_Insert, simdBaseJitType, 16); } diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 5213357674f0d1..d7adcedcdde9c2 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -1769,12 +1769,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); - if (varTypeIsLong(simdBaseType) && (intrinsic == NI_Vector64_op_Multiply)) - { - // long/ulong multiplication needs no special handling for SIMD8 - break; - } - CORINFO_ARG_LIST_HANDLE arg1 = sig->args; CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(arg1); var_types argType = TYP_UNKNOWN; From e0a2942495146eb5ed53739c9db69fa9ccf05e51 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 30 Jun 2024 22:54:04 +0200 Subject: [PATCH 3/6] remove assert --- src/coreclr/jit/gentree.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index a7aba241e0df2e..7648c0850215a0 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21909,7 +21909,6 @@ GenTree* Compiler::gtNewSimdBinOpNode( case GT_MUL: { - assert(!varTypeIsLong(simdBaseType) || (simdSize == 16)); GenTree** scalarOp = nullptr; if (varTypeIsArithmetic(op1)) From 86d4fb31043c98c86487cc3bafdcb7a0c22a55ad Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 30 Jun 2024 22:57:52 +0200 Subject: [PATCH 4/6] add a comment --- src/coreclr/jit/gentree.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 7648c0850215a0..64ce47d2abf592 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21988,6 +21988,7 @@ GenTree* Compiler::gtNewSimdBinOpNode( if (simdSize == 8) { + // Vector64 vec = Vector64.Create(left0 * right0) return gtNewSimdCreateScalarUnsafeNode(TYP_SIMD8, gtNewOperNode(GT_MUL, TYP_LONG, left0, right0), simdBaseJitType, 8); From 818b8bd593adc53b019fbcc63b650ac38ecc1153 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 1 Jul 2024 00:49:21 +0200 Subject: [PATCH 5/6] clean up --- src/coreclr/jit/gentree.cpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 64ce47d2abf592..83bc8c70bf3edb 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21975,27 +21975,26 @@ GenTree* Compiler::gtNewSimdBinOpNode( case TYP_LONG: case TYP_ULONG: { + if (simdSize == 8) + { + // Vector64 vec = Vector64.CreateScalar(op1.ToScalar() * op2.ToScalar()) + op1 = gtNewSimdToScalarNode(TYP_LONG, op1, simdBaseJitType, 8); + op2 = gtNewSimdToScalarNode(TYP_LONG, op2, simdBaseJitType, 8); + GenTreeOp* mul = gtNewOperNode(GT_MUL, TYP_LONG, op1, op2); + return gtNewSimdCreateScalarNode(TYP_SIMD8, mul, simdBaseJitType, 8); + } + // Make op1 and op2 multi-use: GenTree* op1Dup = fgMakeMultiUse(&op1); GenTree* op2Dup = fgMakeMultiUse(&op2); // long left0 = op1.GetElement(0) // long right0 = op2.GetElement(0) - GenTree* left0 = - gtNewSimdGetElementNode(TYP_LONG, op1, gtNewIconNode(0), simdBaseJitType, simdSize); - GenTree* right0 = - gtNewSimdGetElementNode(TYP_LONG, op2, gtNewIconNode(0), simdBaseJitType, simdSize); - - if (simdSize == 8) - { - // Vector64 vec = Vector64.Create(left0 * right0) - return gtNewSimdCreateScalarUnsafeNode(TYP_SIMD8, - gtNewOperNode(GT_MUL, TYP_LONG, left0, right0), - simdBaseJitType, 8); - } + GenTree* left0 = gtNewSimdToScalarNode(TYP_LONG, op1, simdBaseJitType, 16); + GenTree* right0 = gtNewSimdToScalarNode(TYP_LONG, op2, simdBaseJitType, 16); - // long right1 = op2.GetElement(1) // long left1 = op1.GetElement(1) + // long right1 = op2.GetElement(1) GenTree* left1 = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, 16); GenTree* right1 = gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), simdBaseJitType, 16); From b2206c98ca623f346e4d7e204675219f007e7538 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Tue, 2 Jul 2024 16:10:34 +0200 Subject: [PATCH 6/6] handle scalarOp --- src/coreclr/jit/gentree.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 83bc8c70bf3edb..b0a822b1d16c20 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21978,8 +21978,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( if (simdSize == 8) { // Vector64 vec = Vector64.CreateScalar(op1.ToScalar() * op2.ToScalar()) - op1 = gtNewSimdToScalarNode(TYP_LONG, op1, simdBaseJitType, 8); - op2 = gtNewSimdToScalarNode(TYP_LONG, op2, simdBaseJitType, 8); + op1 = gtNewBitCastNode(TYP_LONG, op1); + op2 = gtNewBitCastNode(TYP_LONG, op2); GenTreeOp* mul = gtNewOperNode(GT_MUL, TYP_LONG, op1, op2); return gtNewSimdCreateScalarNode(TYP_SIMD8, mul, simdBaseJitType, 8); } @@ -21990,20 +21990,22 @@ GenTree* Compiler::gtNewSimdBinOpNode( // long left0 = op1.GetElement(0) // long right0 = op2.GetElement(0) - GenTree* left0 = gtNewSimdToScalarNode(TYP_LONG, op1, simdBaseJitType, 16); - GenTree* right0 = gtNewSimdToScalarNode(TYP_LONG, op2, simdBaseJitType, 16); + GenTree* left0 = gtNewSimdToScalarNode(TYP_LONG, op1, simdBaseJitType, 16); + GenTree* right0 = + scalarOp != nullptr ? op2 : gtNewSimdToScalarNode(TYP_LONG, op2, simdBaseJitType, 16); // long left1 = op1.GetElement(1) // long right1 = op2.GetElement(1) GenTree* left1 = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, 16); - GenTree* right1 = gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), simdBaseJitType, 16); + GenTree* right1 = scalarOp != nullptr ? op2Dup + : gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), + simdBaseJitType, 16); // Vector128 vec = Vector128.Create(left0 * right0, left1 * right1) op1 = gtNewOperNode(GT_MUL, TYP_LONG, left0, right0); op2 = gtNewOperNode(GT_MUL, TYP_LONG, left1, right1); GenTree* vec = gtNewSimdCreateScalarUnsafeNode(type, op1, simdBaseJitType, 16); - return gtNewSimdHWIntrinsicNode(type, vec, gtNewIconNode(1), op2, NI_AdvSimd_Insert, - simdBaseJitType, 16); + return gtNewSimdWithElementNode(type, vec, gtNewIconNode(1), op2, simdBaseJitType, 16); } default: