Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API Proposal: More SIMD HW Intrinsics #24794

Closed
sdmaclea opened this issue Jan 25, 2018 · 77 comments · Fixed by #36916
Closed

API Proposal: More SIMD HW Intrinsics #24794

sdmaclea opened this issue Jan 25, 2018 · 77 comments · Fixed by #36916
Assignees
Labels
api-approved API was approved in API review, it can be implemented arch-arm64 area-System.Runtime.Intrinsics
Milestone

Comments

@sdmaclea
Copy link
Contributor

@tannergooding: Updated to match https://github.com/dotnet/corefx/issues/26581#issuecomment-539217015. The previous version is available in the comment history.

namespace System.Runtime.Intrinsics.Arm
{
    public static class AdvSimd
    {
        /// <summary>
        /// Support flag for the members of this class (presumably reports hardware/JIT support — confirm).
        /// Declaration-only stub: the getter throws null.
        /// </summary>
        public static bool IsSupported => throw null;

        /// <summary>
        /// Vector absolute compare, greater than or equal
        ///
        /// Each result element is ~0 when |left[elem]| >= |right[elem]|, otherwise 0
        ///
        /// Maps to the vector forms of ARM64 FACGE
        /// </summary>
        public static Vector64<float> AbsoluteCompareGreaterThanOrEqual(Vector64<float> left, Vector64<float> right) => throw null;
        public static Vector128<float> AbsoluteCompareGreaterThanOrEqual(Vector128<float> left, Vector128<float> right) => throw null;

        /// <summary>
        /// Vector absolute compare, greater than
        ///
        /// Each result element is ~0 when |left[elem]| > |right[elem]|, otherwise 0
        ///
        /// Maps to the vector forms of ARM64 FACGT
        /// </summary>
        public static Vector64<float> AbsoluteCompareGreaterThan(Vector64<float> left, Vector64<float> right) => throw null;
        public static Vector128<float> AbsoluteCompareGreaterThan(Vector128<float> left, Vector128<float> right) => throw null;

        /// <summary>
        /// Vector absolute difference
        ///
        /// For each element result[elem] = | left[elem] - right[elem] |
        ///
        /// The signed-integer overloads return the corresponding unsigned vector type.
        ///
        /// Corresponds to vector forms of ARM64 SABD, UABD &amp; FABD
        /// </summary>
        public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right) { throw null; }
        public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector absolute difference, accumulated
        ///
        /// Each result element is acc[elem] + | left[elem] - right[elem] |
        ///
        /// Maps to the vector forms of ARM64 SABA, UABA
        /// </summary>
        public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) => throw null;
        public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) => throw null;
        public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) => throw null;
        public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<short> acc, Vector64<short> left, Vector64<short> right) => throw null;
        public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) => throw null;
        public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<int> acc, Vector64<int> left, Vector64<int> right) => throw null;
        public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) => throw null;
        public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) => throw null;
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) => throw null;
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short> acc, Vector128<short> left, Vector128<short> right) => throw null;
        public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) => throw null;
        public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<int> acc, Vector128<int> left, Vector128<int> right) => throw null;

        /// <summary>
        /// Vector add pairwise
        ///
        /// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 ADDP &amp; FADDP
        /// </summary>
        // The overloads are plain (non-generic) methods: a keyword such as `byte` cannot name a type parameter.
        public static Vector64<byte>   AddPairwise(Vector64<byte>   left, Vector64<byte>   right) { throw null; }
        public static Vector64<sbyte>  AddPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
        public static Vector64<ushort> AddPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
        public static Vector64<short>  AddPairwise(Vector64<short>  left, Vector64<short>  right) { throw null; }
        public static Vector64<int>    AddPairwise(Vector64<int>    left, Vector64<int>    right) { throw null; }
        public static Vector64<uint>   AddPairwise(Vector64<uint>   left, Vector64<uint>   right) { throw null; }
        public static Vector64<float>  AddPairwise(Vector64<float>  left, Vector64<float>  right) { throw null; }

        /// <summary>
        /// Vector extract from pair of vectors
        ///
        /// For each element result[elem] = elem + index &lt; result.Length ? left[elem + index] : right[elem + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        // The overloads are plain (non-generic) methods: a keyword such as `byte` cannot name a type parameter.
        public static Vector64<byte>    ExtractVector(Vector64<byte>    left, Vector64<byte>    right, byte index) { throw null; }
        public static Vector64<sbyte>   ExtractVector(Vector64<sbyte>   left, Vector64<sbyte>   right, byte index) { throw null; }
        public static Vector64<short>   ExtractVector(Vector64<short>   left, Vector64<short>   right, byte index) { throw null; }
        public static Vector64<ushort>  ExtractVector(Vector64<ushort>  left, Vector64<ushort>  right, byte index) { throw null; }
        public static Vector64<int>     ExtractVector(Vector64<int>     left, Vector64<int>     right, byte index) { throw null; }
        public static Vector64<uint>    ExtractVector(Vector64<uint>    left, Vector64<uint>    right, byte index) { throw null; }

        public static Vector128<byte>   ExtractVector(Vector128<byte>   left, Vector128<byte>   right, byte index) { throw null; }
        public static Vector128<sbyte>  ExtractVector(Vector128<sbyte>  left, Vector128<sbyte>  right, byte index) { throw null; }
        public static Vector128<short>  ExtractVector(Vector128<short>  left, Vector128<short>  right, byte index) { throw null; }
        public static Vector128<ushort> ExtractVector(Vector128<ushort> left, Vector128<ushort> right, byte index) { throw null; }
        public static Vector128<int>    ExtractVector(Vector128<int>    left, Vector128<int>    right, byte index) { throw null; }
        public static Vector128<uint>   ExtractVector(Vector128<uint>   left, Vector128<uint>   right, byte index) { throw null; }
        public static Vector128<long>   ExtractVector(Vector128<long>   left, Vector128<long>   right, byte index) { throw null; }
        public static Vector128<ulong>  ExtractVector(Vector128<ulong>  left, Vector128<ulong>  right, byte index) { throw null; }
        public static Vector128<float>  ExtractVector(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

        /// <summary>
        /// Vector maximum (numeric variant)
        ///
        /// Maps to the vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector64<float> MaxNumeric(Vector64<float> left, Vector64<float> right) => throw null;
        public static Vector128<float> MaxNumeric(Vector128<float> left, Vector128<float> right) => throw null;

        /// <summary>
        /// Vector max pairwise
        ///
        /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP &amp; FMAXP
        /// </summary>
        public static Vector64<byte>    MaxPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MaxPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MaxPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MaxPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MaxPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MaxPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   MaxPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector minimum (numeric variant)
        ///
        /// Maps to the vector forms of ARM64 FMINNM
        /// </summary>
        public static Vector64<float> MinNumeric(Vector64<float> left, Vector64<float> right) => throw null;
        public static Vector128<float> MinNumeric(Vector128<float> left, Vector128<float> right) => throw null;

        /// <summary>
        /// Vector min pairwise
        ///
        /// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMINP, UMINP &amp; FMINP
        /// </summary>
        public static Vector64<byte>    MinPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MinPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MinPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MinPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MinPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MinPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   MinPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector multiply-accumulate
        ///
        /// Each result element is acc[elem] + left[elem] * right[elem]
        ///
        /// Maps to the vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte> MultiplyAdd(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) => throw null;
        public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) => throw null;
        public static Vector64<ushort> MultiplyAdd(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) => throw null;
        public static Vector64<short> MultiplyAdd(Vector64<short> acc, Vector64<short> left, Vector64<short> right) => throw null;
        public static Vector64<uint> MultiplyAdd(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) => throw null;
        public static Vector64<int> MultiplyAdd(Vector64<int> acc, Vector64<int> left, Vector64<int> right) => throw null;
        public static Vector128<byte> MultiplyAdd(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) => throw null;
        public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) => throw null;
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) => throw null;
        public static Vector128<short> MultiplyAdd(Vector128<short> acc, Vector128<short> left, Vector128<short> right) => throw null;
        public static Vector128<uint> MultiplyAdd(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) => throw null;
        public static Vector128<int> MultiplyAdd(Vector128<int> acc, Vector128<int> left, Vector128<int> right) => throw null;

        /// <summary>
        /// Vector multiply-accumulate by a scalar element
        ///
        /// Each result element is acc[elem] + left[elem] * right
        ///
        /// Maps to the vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte> MultiplyAdd(Vector64<byte> acc, Vector64<byte> left, byte right) => throw null;
        public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> acc, Vector64<sbyte> left, sbyte right) => throw null;
        public static Vector64<ushort> MultiplyAdd(Vector64<ushort> acc, Vector64<ushort> left, ushort right) => throw null;
        public static Vector64<short> MultiplyAdd(Vector64<short> acc, Vector64<short> left, short right) => throw null;
        public static Vector64<uint> MultiplyAdd(Vector64<uint> acc, Vector64<uint> left, uint right) => throw null;
        public static Vector64<int> MultiplyAdd(Vector64<int> acc, Vector64<int> left, int right) => throw null;
        public static Vector128<byte> MultiplyAdd(Vector128<byte> acc, Vector128<byte> left, byte right) => throw null;
        public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> acc, Vector128<sbyte> left, sbyte right) => throw null;
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, ushort right) => throw null;
        public static Vector128<short> MultiplyAdd(Vector128<short> acc, Vector128<short> left, short right) => throw null;
        public static Vector128<uint> MultiplyAdd(Vector128<uint> acc, Vector128<uint> left, uint right) => throw null;
        public static Vector128<int> MultiplyAdd(Vector128<int> acc, Vector128<int> left, int right) => throw null;

        /// <summary>
        /// Vector multiply-subtract from accumulator
        ///
        /// Each result element is acc[elem] - left[elem] * right[elem]
        ///
        /// Maps to the vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte> MultiplySubtract(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) => throw null;
        public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) => throw null;
        public static Vector64<ushort> MultiplySubtract(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) => throw null;
        public static Vector64<short> MultiplySubtract(Vector64<short> acc, Vector64<short> left, Vector64<short> right) => throw null;
        public static Vector64<uint> MultiplySubtract(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) => throw null;
        public static Vector64<int> MultiplySubtract(Vector64<int> acc, Vector64<int> left, Vector64<int> right) => throw null;
        public static Vector128<byte> MultiplySubtract(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) => throw null;
        public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) => throw null;
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) => throw null;
        public static Vector128<short> MultiplySubtract(Vector128<short> acc, Vector128<short> left, Vector128<short> right) => throw null;
        public static Vector128<uint> MultiplySubtract(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) => throw null;
        public static Vector128<int> MultiplySubtract(Vector128<int> acc, Vector128<int> left, Vector128<int> right) => throw null;

        /// <summary>
        /// Vector multiply-subtract from accumulator by a scalar element
        ///
        /// Each result element is acc[elem] - left[elem] * right
        ///
        /// Maps to the vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte> MultiplySubtract(Vector64<byte> acc, Vector64<byte> left, byte right) => throw null;
        public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> acc, Vector64<sbyte> left, sbyte right) => throw null;
        public static Vector64<ushort> MultiplySubtract(Vector64<ushort> acc, Vector64<ushort> left, ushort right) => throw null;
        public static Vector64<short> MultiplySubtract(Vector64<short> acc, Vector64<short> left, short right) => throw null;
        public static Vector64<uint> MultiplySubtract(Vector64<uint> acc, Vector64<uint> left, uint right) => throw null;
        public static Vector64<int> MultiplySubtract(Vector64<int> acc, Vector64<int> left, int right) => throw null;
        public static Vector128<byte> MultiplySubtract(Vector128<byte> acc, Vector128<byte> left, byte right) => throw null;
        public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> acc, Vector128<sbyte> left, sbyte right) => throw null;
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, ushort right) => throw null;
        public static Vector128<short> MultiplySubtract(Vector128<short> acc, Vector128<short> left, short right) => throw null;
        public static Vector128<uint> MultiplySubtract(Vector128<uint> acc, Vector128<uint> left, uint right) => throw null;
        public static Vector128<int> MultiplySubtract(Vector128<int> acc, Vector128<int> left, int right) => throw null;

        /// <summary>
        /// Vector fused multiply-accumulate
        ///
        /// Each result element is acc[elem] + left[elem] * right[elem]
        ///
        /// Maps to the vector forms of ARM64 FMLA
        /// </summary>
        public static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, Vector64<float> right) => throw null;
        public static Vector128<float> FusedMultiplyAdd(Vector128<float> acc, Vector128<float> left, Vector128<float> right) => throw null;

        /// <summary>
        /// Vector fused multiply-subtract from accumulator
        ///
        /// Each result element is acc[elem] - left[elem] * right[elem]
        ///
        /// Maps to the vector forms of ARM64 FMLS
        /// </summary>
        public static Vector64<float> FusedMultiplySubtract(Vector64<float> acc, Vector64<float> left, Vector64<float> right) => throw null;
        public static Vector128<float> FusedMultiplySubtract(Vector128<float> acc, Vector128<float> left, Vector128<float> right) => throw null;


        /// <summary>
        /// Vector polynomial multiplication
        ///
        /// Maps to the vector forms of ARM64 PMUL
        /// </summary>
        public static Vector64<byte> PolynomialMultiply(Vector64<byte> left, Vector64<byte> right) => throw null;
        public static Vector64<sbyte> PolynomialMultiply(Vector64<sbyte> left, Vector64<sbyte> right) => throw null;
        public static Vector128<byte> PolynomialMultiply(Vector128<byte> left, Vector128<byte> right) => throw null;
        public static Vector128<sbyte> PolynomialMultiply(Vector128<sbyte> left, Vector128<sbyte> right) => throw null;

        /// <summary>
        /// Vector reciprocal estimate
        ///
        /// See FRECPE docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPE
        /// </summary>
        public static Vector64<float>   ReciprocalEstimate(Vector64<float>   value) { throw null; }
        public static Vector128<float>  ReciprocalEstimate(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Vector reciprocal step
        ///
        /// Refer to the FRECPS documentation for the exact computation
        ///
        /// Maps to the vector forms of ARM64 FRECPS
        ///
        /// NOTE(review): nothing here describes what <c>index</c> selects — confirm it is intended for this operation.
        /// </summary>
        public static Vector64<float> ReciprocalStep(Vector64<float> left, Vector64<float> right, byte index) => throw null;
        public static Vector128<float> ReciprocalStep(Vector128<float> left, Vector128<float> right, byte index) => throw null;

        /// <summary>
        /// Vector reciprocal square root estimate
        ///
        /// Refer to the FRSQRTE documentation for the exact computation
        ///
        /// Maps to the vector forms of ARM64 FRSQRTE
        /// </summary>
        public static Vector64<float> ReciprocalSquareRootEstimate(Vector64<float> value) => throw null;
        public static Vector128<float> ReciprocalSquareRootEstimate(Vector128<float> value) => throw null;

        /// <summary>
        /// Vector reciprocal square root step
        ///
        /// Refer to the FRSQRTS documentation for the exact computation
        ///
        /// Maps to the vector forms of ARM64 FRSQRTS
        ///
        /// NOTE(review): these overloads are documented as the "step" operation (FRSQRTS) but carry the
        /// name ReciprocalSquareRootEstimate — confirm whether ReciprocalSquareRootStep was intended.
        /// The role of <c>index</c> is also not described — confirm.
        /// </summary>
        public static Vector64<float> ReciprocalSquareRootEstimate(Vector64<float> left, Vector64<float> right, byte index) => throw null;
        public static Vector128<float> ReciprocalSquareRootEstimate(Vector128<float> left, Vector128<float> right, byte index) => throw null;

        /// <summary>
        /// Vector byte reversal within each element
        ///
        /// Maps to the vector forms of ARM64 REV16, REV32, REV64
        /// </summary>
        public static Vector64<ushort> ReverseElementBytes(Vector64<ushort> value) => throw null;
        public static Vector64<short> ReverseElementBytes(Vector64<short> value) => throw null;
        public static Vector64<uint> ReverseElementBytes(Vector64<uint> value) => throw null;
        public static Vector64<int> ReverseElementBytes(Vector64<int> value) => throw null;
        public static Vector64<float> ReverseElementBytes(Vector64<float> value) => throw null;
        public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) => throw null;
        public static Vector128<short> ReverseElementBytes(Vector128<short> value) => throw null;
        public static Vector128<uint> ReverseElementBytes(Vector128<uint> value) => throw null;
        public static Vector128<int> ReverseElementBytes(Vector128<int> value) => throw null;
        public static Vector128<ulong> ReverseElementBytes(Vector128<ulong> value) => throw null;
        public static Vector128<long> ReverseElementBytes(Vector128<long> value) => throw null;
        public static Vector128<float> ReverseElementBytes(Vector128<float> value) => throw null;

        /// <summary>
        /// Floating-point multiply-accumulate operations documented against the ARM32 VMLA / VMLS instructions.
        /// All members are declaration-only stubs whose bodies throw null.
        /// </summary>
        public static class Arm32
        {
            /// <summary>
            /// Support flag for the members of this class (presumably reports hardware/JIT support — confirm).
            /// </summary>
            public static bool IsSupported { get { throw null; } }

            /// <summary>
            /// Vector multiply add
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM32 VMLA
            /// </summary>
            public static Vector64<float>   MultiplyAdd(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplyAdd(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

            /// <summary>
            /// Vector multiply add by element
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right
            ///
            /// Corresponds to vector forms of ARM32 VMLA
            /// </summary>
            public static Vector64<float>   MultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  MultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }

            /// <summary>
            /// Vector multiply subtract
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM32 VMLS
            /// </summary>
            public static Vector64<float>   MultiplySubtract(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplySubtract(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

            /// <summary>
            /// Vector multiply subtract by element
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right
            ///
            /// Corresponds to vector forms of ARM32 VMLS
            /// </summary>
            public static Vector64<float>   MultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  MultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
        }

        public static class Arm64
        {
            /// <summary>
            /// Support flag for the members of this class (presumably reports hardware/JIT support — confirm).
            /// Declaration-only stub: the getter throws null.
            /// </summary>
            public static bool IsSupported => throw null;

            /// <summary>
            /// Vector absolute compare, greater than or equal
            ///
            /// Each result element is ~0 when |left[elem]| >= |right[elem]|, otherwise 0
            ///
            /// Maps to the vector forms of ARM64 FACGE
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right) => throw null;

            /// <summary>
            /// Vector absolute compare, greater than
            ///
            /// Each result element is ~0 when |left[elem]| > |right[elem]|, otherwise 0
            ///
            /// Maps to the vector forms of ARM64 FACGT
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right) => throw null;

            /// <summary>
            /// Vector absolute difference
            ///
            /// For each element result[elem] = | left[elem] - right[elem] |
            ///
            /// Corresponds to vector forms of ARM64 SABD, UABD &amp; FABD
            /// </summary>
            public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector add pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 ADDP &amp; FADDP
            /// </summary>
            // Each overload returns the same vector type it accepts; the methods are non-generic, so no
            // type parameter `T` exists and a keyword such as `byte` cannot name one.
            public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right) { throw null; }
            public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right) { throw null; }
            public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector extract from pair of vectors
            ///
            /// For each element result[elem] = elem + index &lt; result.Length ? left[elem + index] : right[elem + index - result.Length]
            ///
            /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
            ///
            /// Corresponds to vector forms of ARM64 EXT
            /// </summary>
            // Non-generic overload: a keyword such as `double` cannot name a type parameter.
            public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector add across all elements, producing a scalar of the element type
            ///
            /// Maps to the vector forms of ARM64 ADDV
            /// </summary>
            public static byte AddAcross(Vector64<byte> value) => throw null;
            public static sbyte AddAcross(Vector64<sbyte> value) => throw null;
            public static ushort AddAcross(Vector64<ushort> value) => throw null;
            public static short AddAcross(Vector64<short> value) => throw null;
            public static uint AddAcross(Vector64<uint> value) => throw null;
            public static int AddAcross(Vector64<int> value) => throw null;
            public static byte AddAcross(Vector128<byte> value) => throw null;
            public static sbyte AddAcross(Vector128<sbyte> value) => throw null;
            public static ushort AddAcross(Vector128<ushort> value) => throw null;
            public static short AddAcross(Vector128<short> value) => throw null;
            public static uint AddAcross(Vector128<uint> value) => throw null;
            public static int AddAcross(Vector128<int> value) => throw null;
            /// <summary>
            /// Vector maximum (numeric variant)
            ///
            /// Maps to the vector forms of ARM64 FMAXNM
            /// </summary>
            public static Vector128<double> MaxNumeric(Vector128<double> left, Vector128<double> right) => throw null;

            /// <summary>
            /// Vector max numeric pairwise
            ///
            /// Adjacent pairs of left fill the lower half of the result, adjacent pairs of right the upper half.
            ///
            /// For each element result[elem] = result.Length > 2*elem ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMP
            /// </summary>
            public static Vector64<float>   MaxNumericPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MaxNumericPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MaxNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector max numeric across
            ///
            /// Reduces all elements of value to a single scalar:
            /// result = max(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMV
            /// </summary>
            public static float  MaxNumericAcross(Vector128<float>  value) { throw null; }

            /// <summary>
            /// Vector max pairwise
            ///
            /// Adjacent pairs of left fill the lower half of the result, adjacent pairs of right the upper half.
            ///
            /// For each element result[elem] = result.Length > 2*elem ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMAXP, UMAXP and FMAXP
            /// </summary>
            public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            // NOTE(review): the ulong, long and double overloads have no counterpart in MinAcross;
            // confirm that SMAXV / UMAXV / FMAXV really provide those arrangements.
            /// <summary>
            /// Vector max across
            ///
            /// Reduces all elements of value to a single scalar of the element type:
            /// result = max(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 SMAXV, UMAXV and FMAXV
            /// </summary>
            public static byte   MaxAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  MaxAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort MaxAcross(Vector64<ushort>  value) { throw null; }
            public static short  MaxAcross(Vector64<short>   value) { throw null; }
            public static uint   MaxAcross(Vector64<uint>    value) { throw null; }
            public static int    MaxAcross(Vector64<int>     value) { throw null; }
            public static float  MaxAcross(Vector64<float>   value) { throw null; }
            public static byte   MaxAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  MaxAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort MaxAcross(Vector128<ushort> value) { throw null; }
            public static short  MaxAcross(Vector128<short>  value) { throw null; }
            public static uint   MaxAcross(Vector128<uint>   value) { throw null; }
            public static int    MaxAcross(Vector128<int>    value) { throw null; }
            public static ulong  MaxAcross(Vector128<ulong>  value) { throw null; }
            public static long   MaxAcross(Vector128<long>   value) { throw null; }
            public static float  MaxAcross(Vector128<float>  value) { throw null; }
            public static double MaxAcross(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector min numeric
            ///
            /// For each element result[elem] = minNumeric(left[elem], right[elem])
            ///
            /// NOTE(review): NaN handling presumably follows the FMINNM description — confirm.
            ///
            /// Corresponds to vector forms of ARM64 FMINNM
            /// </summary>
            public static Vector128<double> MinNumeric(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min numeric pairwise
            ///
            /// Adjacent pairs of left fill the lower half of the result, adjacent pairs of right the upper half.
            ///
            /// For each element result[elem] = result.Length > 2*elem ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMINNMP
            /// </summary>
            public static Vector64<float>   MinNumericPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MinNumericPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MinNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min numeric across
            ///
            /// Reduces all elements of value to a single scalar:
            /// result = min(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 FMINNMV
            /// </summary>
            public static float  MinNumericAcross(Vector128<float>  value) { throw null; }

            /// <summary>
            /// Vector min pairwise
            ///
            /// Adjacent pairs of left fill the lower half of the result, adjacent pairs of right the upper half.
            ///
            /// For each element result[elem] = result.Length > 2*elem ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMINP, UMINP and FMINP
            /// </summary>
            public static Vector128<byte>   MinPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  MinPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  MinPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MinPairwise(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<int>    MinPairwise(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<float>  MinPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min across
            ///
            /// Reduces all elements of value to a single scalar of the element type:
            /// result = min(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 SMINV, UMINV and FMINV
            /// </summary>
            public static byte   MinAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  MinAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort MinAcross(Vector64<ushort>  value) { throw null; }
            public static short  MinAcross(Vector64<short>   value) { throw null; }
            public static uint   MinAcross(Vector64<uint>    value) { throw null; }
            public static int    MinAcross(Vector64<int>     value) { throw null; }
            public static float  MinAcross(Vector64<float>   value) { throw null; }
            public static byte   MinAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  MinAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort MinAcross(Vector128<ushort> value) { throw null; }
            public static short  MinAcross(Vector128<short>  value) { throw null; }
            public static uint   MinAcross(Vector128<uint>   value) { throw null; }
            public static int    MinAcross(Vector128<int>    value) { throw null; }
            public static float  MinAcross(Vector128<float>  value) { throw null; }
            public static double MinAcross(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector fused multiply add
            ///
            /// Multiplies left by right element-wise and adds the product to the accumulator acc.
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM64 FMLA
            /// </summary>
            public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector fused multiply add by element
            ///
            /// Multiplies every element of left by the scalar right and adds the product to the accumulator acc.
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right
            ///
            /// Corresponds to vector forms of ARM64 FMLA
            /// </summary>
            public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
            public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

            /// <summary>
            /// Vector fused multiply subtract
            ///
            /// Multiplies left by right element-wise and subtracts the product from the accumulator acc.
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM64 FMLS
            /// </summary>
            public static Vector128<double>  FusedMultiplySubtract(Vector128<double>  acc, Vector128<double>  left, Vector128<double>  right) { throw null; }

            /// <summary>
            /// Vector fused multiply subtract by element
            ///
            /// Multiplies every element of left by the scalar right and subtracts the product from the accumulator acc.
            /// The scalar has the same type as the vector's elements.
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right
            ///
            /// Corresponds to vector forms of ARM64 FMLS
            /// </summary>
            public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
            public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

            /// <summary>
            /// Vector multiply extend
            ///
            /// For each element result[elem] = left[elem] * right[elem]
            /// Zero and infinite operands get the special-case handling defined for FMULX.
            /// NOTE(review): the exact special-case results are not spelled out here — see the FMULX description.
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MultiplyExtend(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector multiply extend by element
            ///
            /// Multiplies every element of left by the scalar right, which has the vector's element type.
            ///
            /// For each element result[elem] = left[elem] * right
            /// Zero and infinite operands get the special-case handling defined for FMULX.
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, float  right) { throw null; }
            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, float  right) { throw null; }
            public static Vector128<double> MultiplyExtend(Vector128<double> left, double right) { throw null; }

            /// <summary>
            /// Vector reciprocal estimate
            ///
            /// See FRECPE docs
            ///
            /// Corresponds to vector forms of ARM64 FRECPE
            /// </summary>
            public static Vector128<double> ReciprocalEstimate(Vector128<double> value) { throw null; }

            // NOTE(review): the index parameter is not described anywhere in this summary and, to my
            // knowledge, FRECPS takes no immediate operand — it looks copied from ExtractVector; confirm.
            /// <summary>
            /// Vector reciprocal step
            ///
            /// See FRECPS docs
            ///
            /// Corresponds to vector forms of ARM64 FRECPS
            /// </summary>
            public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector reciprocal square root estimate
            ///
            /// Single-operand form; see FRSQRTE docs for the definition of the estimate.
            ///
            /// Corresponds to vector forms of ARM64 FRSQRTE
            /// </summary>
            public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value) { throw null; }

            // NOTE(review): the index parameter is kept from the original signature, but it is not
            // described here and, to my knowledge, FRSQRTS takes no immediate operand — confirm.
            /// <summary>
            /// Vector reciprocal square root step
            ///
            /// See FRSQRTS docs
            ///
            /// Corresponds to vector forms of ARM64 FRSQRTS
            /// </summary>
            public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector reverse byte bits
            ///
            /// Reverses the bit order within each byte-sized element of value.
            ///
            /// Corresponds to vector forms of ARM64 RBIT
            /// </summary>
            public static Vector64<byte>    ReverseElementBits(Vector64<byte>    value) { throw null; }
            public static Vector64<sbyte>   ReverseElementBits(Vector64<sbyte>   value) { throw null; }
            public static Vector128<byte>   ReverseElementBits(Vector128<byte>   value) { throw null; }
            public static Vector128<sbyte>  ReverseElementBits(Vector128<sbyte>  value) { throw null; }
        }
  }
}
@sdmaclea
Copy link
Contributor Author

This is the next wave of SIMD instructions which I plan to implement

@CarolEidt @RussKeldorph @eerhardt PTAL
@tannergooding @4creators @fiigii @dotnet/arm64-contrib @dotnet/jit-contrib

@sdmaclea
Copy link
Contributor Author

This introduces more complicated intrinsic overloads. For instance

  • Multiply(Vector64<float>, Vector64<float>)
  • Multiply(Vector64<float>, float)

This introduces some implementation complexity. lookupHWIntrinsic must check Method arguments.

It may make the API less safe.

The second form represents multiply by vector element. It could be renamed to

  • MultiplyByElement(Vector64<float>, float)

There are other cases Extract, MultiplyAdd ...

Opinions?

@sdmaclea
Copy link
Contributor Author

sdmaclea commented Jan 30, 2018

Looks like X86 intrinsics is using MultiplyScalar(Vector64<float>, float) for Arm64's
MultiplyByElement(Vector64<float>, float). I'm OK with renaming.

@fiigii
Copy link
Contributor

fiigii commented Jan 30, 2018

Looks like X86 intrinsics is using MultiplyScalar(Vector64, float) for Arm64's
MultiplyByElement(Vector64, float). I'm OK with renaming.

X86 does not have MultiplyScalar(Vector64<float>, float). In Intel HW intrinsics, Scalar means operating over Vector128<T> but only computing the first element.

@TamarChristinaArm
Copy link
Contributor

Hi, I'm wondering what the status of this proposal is?

@sdmaclea
Copy link
Contributor Author

sdmaclea commented Oct 7, 2019

@TamarChristinaArm When I moved to Microsoft, I stopped championing this. In my opinion it was in good shape when I left it.

The proposal is probably a little out of date. When some of the other intrinsic API were approved, the namespace changed.

If someone was motivated to implement this and we had consensus, the next step would be to mark this as API ready for review and have an API design review.

@TamarChristinaArm
Copy link
Contributor

In my opinion it was in good shape when I left it.

@sdmaclea I agree. I have some of this implemented as I was going off a different list, but it would be best just to get these approved.

The namespace change shouldn't affect this much so I think it's fine to review as is. The other approved APIs need slight changes due to the namespaces too but it's easier to have them all approved as is (as it's mostly about the intrinsics themselves no? the namespace they end up in is determined by the ISA).

If someone was motivated to implement this and we had consensus, the next step would be to mark this as API ready for review and have an API design review.

How should I go about this? Do I just add the label?

@sdmaclea
Copy link
Contributor Author

sdmaclea commented Oct 7, 2019

I doubt you can add the label. I'll add it. I just tried but GitHub seems to have issues at the moment.

@tannergooding
Copy link
Member

The namespace change shouldn't affect this much so I think it's fine to review as is.

Part of this is trivial and just involves updating the class name and namespace name.

The more difficult part comes from pulling out what is shared vs what is ARM64 specific; which needs to be done anyways.

I'm fine with marking this ready-for-review, but I'd like to see us get the proposal updated before it is reviewed, if possible. It tends to make the entire process easier and is ultimately part of implementing it anyways.

@sdmaclea
Copy link
Contributor Author

sdmaclea commented Oct 7, 2019

@terrajobst Can we schedule this for API review? Can we include Arm on the call?

@sdmaclea
Copy link
Contributor Author

sdmaclea commented Oct 7, 2019

@tannergooding @TamarChristinaArm My head is not in this space at the moment. I am happy to play admin, but I can't drive this.

If you comment on what needs changed, I am happy to update.

Either way marking ready for review seems fine. It will take at least a week to schedule the review. We should clean up as quickly as practical.

@TamarChristinaArm
Copy link
Contributor

I'll post the changes required to adhere to dotnet/corefx#37199 today. I've already started on it.

@tannergooding
Copy link
Member

If you comment on what needs changed, I am happy to update.

I can modify the original comment on any of these and you shouldn't need to worry about it 😄

I can also handle ensuring this gets a slot on the API review schedule, etc.

@TamarChristinaArm
Copy link
Contributor

TamarChristinaArm commented Oct 7, 2019

Extract in this proposal uses the same name as the intrinsics in #24588. There's no clash because the overloads are different, but they are completely different intrinsics. Should this one instead be something like ExtractVector?

@tannergooding
Copy link
Member

I think ExtractVector makes sense, given my understanding of the API.

@TamarChristinaArm
Copy link
Contributor

TamarChristinaArm commented Oct 7, 2019

new list below

changes:

  • drop Multiply (already implemented in API Proposal: Arm64 Simd Insert and Extract elements #24588)
  • rename Extract to ExtractVector
  • expand AddPairwise away from generics
  • expand ExtractVector away from generics
  • drop unsupported overloads, e.g. from MaxNumericAcross.
  • updated some comments
  • remove FRECPX as that has no vector versions
  • Separate shared and A64 only intrinsics
  • Separate MLA/MLS from FMA/FMS

I believe this is the full rewritten list, I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.

namespace System.Runtime.Intrinsics.Arm
{
    public static class Simd
    {
        /// <summary>
        /// Reference stub whose getter throws.
        /// NOTE(review): presumably reports whether the intrinsics of this class can be used on the
        /// current hardware — confirm against the implementation.
        /// </summary>
        public static bool IsSupported { get { throw null; } }

        /// <summary>
        /// Vector absolute compare greater than or equal
        ///
        /// Compares the magnitudes of the operands; a true lane has all bits set, a false lane is zero.
        ///
        /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGE
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThanOrEqual(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  AbsoluteCompareGreaterThanOrEqual(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector absolute compare greater than
        ///
        /// Compares the magnitudes of the operands; a true lane has all bits set, a false lane is zero.
        ///
        /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGT
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThan(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  AbsoluteCompareGreaterThan(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector absolute difference
        ///
        /// For each element result[elem] = | left[elem] - right[elem] |
        ///
        /// The overloads taking signed integer vectors return the unsigned vector of the same element width.
        ///
        /// Corresponds to vector forms of ARM64 SABD, UABD and FABD
        /// </summary>
        public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right) { throw null; }
        public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right) { throw null; }

        // NOTE(review): the signed overloads take a signed accumulator but return the unsigned vector
        // type, so acc and the result differ in type — confirm this is intended.
        /// <summary>
        /// Vector absolute difference add
        ///
        /// Adds the element-wise absolute difference of left and right to the accumulator acc.
        ///
        /// For each element result[elem] = acc[elem] + | left[elem] - right[elem] |
        ///
        /// Corresponds to vector forms of ARM64 SABA, UABA
        /// </summary>
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) { throw null; }

        /// <summary>
        /// Vector add pairwise
        ///
        /// Sums of adjacent pairs of left fill the lower half of the result, sums of adjacent pairs of right the upper half.
        ///
        /// For each element result[elem] = result.Length > 2*elem ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 ADDP and FADDP
        /// </summary>
        public static Vector64<byte>    AddPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   AddPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  AddPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   AddPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<int>     AddPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<uint>    AddPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<float>   AddPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector extract from pair of vectors
        ///
        /// Takes the tail of left starting at index, followed by the head of right, to fill one result vector.
        ///
        /// For each byte result[byte] = result.Length > byte + index ? left[byte + index] : right[byte + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        public static Vector64<byte>    ExtractVector(Vector64<byte>    left, Vector64<byte>    right, byte index) { throw null; }
        public static Vector64<sbyte>   ExtractVector(Vector64<sbyte>   left, Vector64<sbyte>   right, byte index) { throw null; }
        public static Vector64<short>   ExtractVector(Vector64<short>   left, Vector64<short>   right, byte index) { throw null; }
        public static Vector64<ushort>  ExtractVector(Vector64<ushort>  left, Vector64<ushort>  right, byte index) { throw null; }
        public static Vector64<int>     ExtractVector(Vector64<int>     left, Vector64<int>     right, byte index) { throw null; }
        public static Vector64<uint>    ExtractVector(Vector64<uint>    left, Vector64<uint>    right, byte index) { throw null; }

        public static Vector128<byte>   ExtractVector(Vector128<byte>   left, Vector128<byte>   right, byte index) { throw null; }
        public static Vector128<sbyte>  ExtractVector(Vector128<sbyte>  left, Vector128<sbyte>  right, byte index) { throw null; }
        public static Vector128<short>  ExtractVector(Vector128<short>  left, Vector128<short>  right, byte index) { throw null; }
        public static Vector128<ushort> ExtractVector(Vector128<ushort> left, Vector128<ushort> right, byte index) { throw null; }
        public static Vector128<int>    ExtractVector(Vector128<int>    left, Vector128<int>    right, byte index) { throw null; }
        public static Vector128<uint>   ExtractVector(Vector128<uint>   left, Vector128<uint>   right, byte index) { throw null; }
        public static Vector128<long>   ExtractVector(Vector128<long>   left, Vector128<long>   right, byte index) { throw null; }
        public static Vector128<ulong>  ExtractVector(Vector128<ulong>  left, Vector128<ulong>  right, byte index) { throw null; }
        public static Vector128<float>  ExtractVector(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

        /// <summary>
        /// Vector max numeric
        ///
        /// For each element result[elem] = maxNumeric(left[elem], right[elem])
        ///
        /// NOTE(review): NaN handling presumably follows the FMAXNM description — confirm.
        ///
        /// Corresponds to vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector64<float>   MaxNumeric(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  MaxNumeric(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector max pairwise
        ///
        /// Adjacent pairs of left fill the lower half of the result, adjacent pairs of right the upper half.
        ///
        /// For each element result[elem] = result.Length > 2*elem ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP and FMAXP
        /// </summary>
        public static Vector64<byte>    MaxPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MaxPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MaxPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MaxPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MaxPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MaxPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   MaxPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector min numeric
        ///
        /// For each element result[elem] = minNumeric(left[elem], right[elem])
        ///
        /// NOTE(review): NaN handling presumably follows the FMINNM description — confirm.
        ///
        /// Corresponds to vector forms of ARM64 FMINNM
        /// </summary>
        public static Vector64<float>   MinNumeric(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  MinNumeric(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector min pairwise
        ///
        /// Adjacent pairs of left fill the lower half of the result, adjacent pairs of right the upper half.
        ///
        /// For each element result[elem] = result.Length > 2*elem ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMINP, UMINP and FMINP
        /// </summary>
        public static Vector64<byte>    MinPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MinPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MinPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MinPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MinPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MinPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   MinPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector multiply add
        ///
        /// Multiplies left by right element-wise and adds the product to the accumulator acc.
        /// Only integer element types are declared in this group.
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAdd(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MultiplyAdd(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MultiplyAdd(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MultiplyAdd(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector128<byte>   MultiplyAdd(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<short>  MultiplyAdd(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   MultiplyAdd(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<int>    MultiplyAdd(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) { throw null; }

        /// <summary>
        /// Vector multiply add by element
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAdd(Vector64<byte>    acc, Vector64<byte>    left, byte    right) { throw null; }
        public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, sbyte   right) { throw null; }
        public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  acc, Vector64<ushort>  left, ushort  right) { throw null; }
        public static Vector64<short>   MultiplyAdd(Vector64<short>   acc, Vector64<short>   left, short   right) { throw null; }
        public static Vector64<uint>    MultiplyAdd(Vector64<uint>    acc, Vector64<uint>    left, uint    right) { throw null; }
        public static Vector64<int>     MultiplyAdd(Vector64<int>     acc, Vector64<int>     left, int     right) { throw null; }
        public static Vector128<byte>   MultiplyAdd(Vector128<byte>   acc, Vector128<byte>   left, byte    right) { throw null; }
        public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, sbyte   right) { throw null; }
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, ushort  right) { throw null; }
        public static Vector128<short>  MultiplyAdd(Vector128<short>  acc, Vector128<short>  left, short   right) { throw null; }
        public static Vector128<uint>   MultiplyAdd(Vector128<uint>   acc, Vector128<uint>   left, uint    right) { throw null; }
        public static Vector128<int>    MultiplyAdd(Vector128<int>    acc, Vector128<int>    left, int     right) { throw null; }

        /// <summary>
        /// Vector multiply subtract
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtract(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MultiplySubtract(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MultiplySubtract(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MultiplySubtract(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector128<byte>   MultiplySubtract(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<short>  MultiplySubtract(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   MultiplySubtract(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<int>    MultiplySubtract(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) { throw null; }

        /// <summary>
        /// Vector multiply subtract by element
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtract(Vector64<byte>    acc, Vector64<byte>    left, byte    right) { throw null; }
        public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   acc, Vector64<sbyte>   left, sbyte   right) { throw null; }
        public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  acc, Vector64<ushort>  left, ushort  right) { throw null; }
        public static Vector64<short>   MultiplySubtract(Vector64<short>   acc, Vector64<short>   left, short   right) { throw null; }
        public static Vector64<uint>    MultiplySubtract(Vector64<uint>    acc, Vector64<uint>    left, uint    right) { throw null; }
        public static Vector64<int>     MultiplySubtract(Vector64<int>     acc, Vector64<int>     left, int     right) { throw null; }
        public static Vector128<byte>   MultiplySubtract(Vector128<byte>   acc, Vector128<byte>   left, byte    right) { throw null; }
        public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  acc, Vector128<sbyte>  left, sbyte   right) { throw null; }
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, ushort  right) { throw null; }
        public static Vector128<short>  MultiplySubtract(Vector128<short>  acc, Vector128<short>  left, short   right) { throw null; }
        public static Vector128<uint>   MultiplySubtract(Vector128<uint>   acc, Vector128<uint>   left, uint    right) { throw null; }
        public static Vector128<int>    MultiplySubtract(Vector128<int>    acc, Vector128<int>    left, int     right) { throw null; }

        /// <summary>
        /// Vector fused multiply add
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLA
        /// </summary>
        public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector fused multiply subtract
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLS
        /// </summary>
        public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }


        /// <summary>
        /// Vector polynomial multiply
        /// Corresponds to vector forms of ARM64 PMUL
        /// </summary>
        public static Vector64<byte>    PolynomialMultiply(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   PolynomialMultiply(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector128<byte>   PolynomialMultiply(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  PolynomialMultiply(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }

        /// <summary>
        /// Vector reciprocal estimate
        ///
        /// See FRECPE docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPE
        /// </summary>
        public static Vector64<float>   ReciprocalEstimate(Vector64<float>   value) { throw null; }
        public static Vector128<float>  ReciprocalEstimate(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Vector reciprocal step
        ///
        /// See FRECPS docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPS
        /// </summary>
        /// <remarks>
        /// NOTE(review): nothing in this listing says what <c>index</c> selects, and FRECPS appears to take
        /// only two vector operands - confirm whether this parameter belongs in the signature.
        /// </remarks>
        public static Vector64<float>   ReciprocalStep(Vector64<float>   left, Vector64<float>   right, byte index) { throw null; }
        public static Vector128<float>  ReciprocalStep(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

        /// <summary>
        /// Vector reciprocal square root estimate
        ///
        /// See FRSQRTE docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTE
        /// </summary>
        public static Vector64<float>   ReciprocalSquareRootEstimate(Vector64<float>   value) { throw null; }
        public static Vector128<float>  ReciprocalSquareRootEstimate(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Vector reciprocal square root step
        ///
        /// See FRSQRTS docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTS
        /// </summary>
        /// <remarks>
        /// NOTE(review): these overloads are documented as the "step" operation (FRSQRTS) but are named
        /// ReciprocalSquareRootEstimate; by analogy with ReciprocalEstimate/ReciprocalStep the intended name
        /// is presumably ReciprocalSquareRootStep - confirm before approval.
        /// NOTE(review): the role of <c>index</c> is not described anywhere in this listing - confirm it is needed.
        /// </remarks>
        public static Vector64<float>   ReciprocalSquareRootEstimate(Vector64<float>   left, Vector64<float>   right, byte index) { throw null; }
        public static Vector128<float>  ReciprocalSquareRootEstimate(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

        /// <summary>
        /// Vector reverse element bytes
        /// Corresponds to vector forms of ARM64 REV16, REV32, REV64
        /// </summary>
        public static Vector64<ushort>  ReverseElementBytes(Vector64<ushort>  value) { throw null; }
        public static Vector64<short>   ReverseElementBytes(Vector64<short>   value) { throw null; }
        public static Vector64<uint>    ReverseElementBytes(Vector64<uint>    value) { throw null; }
        public static Vector64<int>     ReverseElementBytes(Vector64<int>     value) { throw null; }
        public static Vector64<float>   ReverseElementBytes(Vector64<float>   value) { throw null; }
        public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) { throw null; }
        public static Vector128<short>  ReverseElementBytes(Vector128<short>  value) { throw null; }
        public static Vector128<uint>   ReverseElementBytes(Vector128<uint>   value) { throw null; }
        public static Vector128<int>    ReverseElementBytes(Vector128<int>    value) { throw null; }
        public static Vector128<ulong>  ReverseElementBytes(Vector128<ulong>  value) { throw null; }
        public static Vector128<long>   ReverseElementBytes(Vector128<long>   value) { throw null; }
        public static Vector128<float>  ReverseElementBytes(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Overloads documented as corresponding to ARM32 instructions (VMLA / VMLS on float vectors).
        /// </summary>
        public static class Arm32
        {
            public static bool IsSupported { get { throw null; } }

            /// <summary>
            /// Vector multiply add
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM32 VMLA
            /// </summary>
            public static Vector64<float>   MultiplyAdd(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplyAdd(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

            /// <summary>
            /// Vector multiply add by element
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right
            ///
            /// Corresponds to vector forms of ARM32 VMLA
            /// </summary>
            public static Vector64<float>   MultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  MultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }

            /// <summary>
            /// Vector multiply subtract
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM32 VMLS
            /// </summary>
            public static Vector64<float>   MultiplySubtract(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplySubtract(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

            /// <summary>
            /// Vector multiply subtract by element
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right
            ///
            /// Corresponds to vector forms of ARM32 VMLS
            /// </summary>
            public static Vector64<float>   MultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  MultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
        }

        public static class Arm64
        {
            public static bool IsSupported { get { throw null; } }

            /// <summary>
            /// Vector CompareGreaterThanOrEqual
            /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
            /// Corresponds to vector forms of ARM64 FACGE
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector CompareGreaterThan
            ///
            /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
            ///
            /// Corresponds to vector forms of ARM64 FACGT
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector absolute difference
            /// Corresponds to vector forms of ARM64 SABD, UABD & FABD
            /// </summary>
            public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector add pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 ADDP & FADDP
            /// </summary>
            /// <param name="left">Vector whose adjacent element pairs produce the lower half of the result.</param>
            /// <param name="right">Vector whose adjacent element pairs produce the upper half of the result.</param>
            /// <returns>The pairwise sums of <paramref name="left"/> followed by the pairwise sums of <paramref name="right"/>.</returns>
            // Declared as plain overloads: a keyword such as `byte` cannot be used as a type-parameter name,
            // and `Vector128<T>` had no `T` in scope.
            public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right) { throw null; }
            public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right) { throw null; }
            public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector extract from pair of vectors
            /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
            ///
            /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
            ///
            /// Corresponds to vector forms of ARM64 EXT
            /// </summary>
            /// <param name="left">Vector supplying the result positions for which the offset position is still inside the vector.</param>
            /// <param name="right">Vector supplying the remaining result positions.</param>
            /// <param name="index">Constant offset applied when selecting from the concatenated pair.</param>
            // Declared as a plain overload: `ExtractVector<double>` is not valid C# because a keyword
            // cannot be used as a type-parameter name.
            public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector add across vector elements
            /// Corresponds to vector forms of ARM64 ADDV
            /// </summary>
            public static byte   AddAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  AddAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort AddAcross(Vector64<ushort>  value) { throw null; }
            public static short  AddAcross(Vector64<short>   value) { throw null; }
            public static uint   AddAcross(Vector64<uint>    value) { throw null; }
            public static int    AddAcross(Vector64<int>     value) { throw null; }
            public static byte   AddAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  AddAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort AddAcross(Vector128<ushort> value) { throw null; }
            public static short  AddAcross(Vector128<short>  value) { throw null; }
            public static uint   AddAcross(Vector128<uint>   value) { throw null; }
            public static int    AddAcross(Vector128<int>    value) { throw null; }

            /// <summary>
            /// Vector max numeric
            /// Corresponds to vector forms of ARM64 FMAXNM
            /// </summary>
            public static Vector128<double> MaxNumeric(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector max numeric pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMP
            /// </summary>
            public static Vector64<float>   MaxNumericPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MaxNumericPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MaxNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector max numeric across
            ///
            /// result = max(value[0], ... , value[length -1])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMV
            /// </summary>
            public static float  MaxNumericAcross(Vector128<float>  value) { throw null; }

            /// <summary>
            /// Vector max pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMAXP, UMAXP & FMAXP
            /// </summary>
            public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector max across
            ///
            /// result = max(value[0], ... , value[length -1])
            ///
            /// Corresponds to vector forms of ARM64 SMAXV, UMAXV & FMAXV
            /// </summary>
            public static byte   MaxAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  MaxAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort MaxAcross(Vector64<ushort>  value) { throw null; }
            public static short  MaxAcross(Vector64<short>   value) { throw null; }
            public static uint   MaxAcross(Vector64<uint>    value) { throw null; }
            public static int    MaxAcross(Vector64<int>     value) { throw null; }
            public static float  MaxAcross(Vector64<float>   value) { throw null; }
            public static byte   MaxAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  MaxAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort MaxAcross(Vector128<ushort> value) { throw null; }
            public static short  MaxAcross(Vector128<short>  value) { throw null; }
            public static uint   MaxAcross(Vector128<uint>   value) { throw null; }
            public static int    MaxAcross(Vector128<int>    value) { throw null; }
            public static ulong  MaxAcross(Vector128<ulong>  value) { throw null; }
            public static long   MaxAcross(Vector128<long>   value) { throw null; }
            public static float  MaxAcross(Vector128<float>  value) { throw null; }
            public static double MaxAcross(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector min numeric
            /// Corresponds to vector forms of ARM64 FMINNM
            /// </summary>
            public static Vector128<double> MinNumeric(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min numeric pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMINNMP
            /// </summary>
            /// <param name="left">Vector whose adjacent element pairs produce the lower half of the result.</param>
            /// <param name="right">Vector whose adjacent element pairs produce the upper half of the result.</param>
            // Named Min* to match the documented FMINNMP operation; the previous Max* name duplicated
            // the MaxNumericPairwise overloads declared earlier in this class.
            public static Vector64<float>   MinNumericPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MinNumericPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MinNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min numeric across
            ///
            /// result = min(value[0], ... , value[length -1])
            ///
            /// Corresponds to vector forms of ARM64 FMINNMV
            /// </summary>
            /// <param name="value">Vector whose elements are reduced to a single minimum.</param>
            // Named Min* to match the documented FMINNMV operation; the previous Max* name duplicated
            // the MaxNumericAcross overload declared earlier in this class.
            public static float  MinNumericAcross(Vector128<float>  value) { throw null; }

            /// <summary>
            /// Vector min pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMINP, UMINP & FMINP
            /// </summary>
            public static Vector128<byte>   MinPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  MinPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  MinPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MinPairwise(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<int>    MinPairwise(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<float>  MinPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min across
            ///
            /// result = min(value[0], ... , value[length -1])
            ///
            /// Corresponds to vector forms of ARM64 SMINV, UMINV & FMINV
            /// </summary>
            public static byte   MinAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  MinAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort MinAcross(Vector64<ushort>  value) { throw null; }
            public static short  MinAcross(Vector64<short>   value) { throw null; }
            public static uint   MinAcross(Vector64<uint>    value) { throw null; }
            public static int    MinAcross(Vector64<int>     value) { throw null; }
            public static float  MinAcross(Vector64<float>   value) { throw null; }
            public static byte   MinAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  MinAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort MinAcross(Vector128<ushort> value) { throw null; }
            public static short  MinAcross(Vector128<short>  value) { throw null; }
            public static uint   MinAcross(Vector128<uint>   value) { throw null; }
            public static int    MinAcross(Vector128<int>    value) { throw null; }
            public static float  MinAcross(Vector128<float>  value) { throw null; }
            public static double MinAcross(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector fused multiply add
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM64 FMLA
            /// </summary>
            public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector fused multiply add by element
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right
            ///
            /// Corresponds to vector forms of ARM64 FMLA
            /// </summary>
            /// <param name="acc">Accumulator vector the product is added to.</param>
            /// <param name="left">Vector multiplicand.</param>
            /// <param name="right">Scalar multiplier applied to every element of <paramref name="left"/>.</param>
            // The third overload is the double-precision form; it was previously an exact duplicate
            // of the Vector128<float> overload, which does not compile.
            public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
            public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

            /// <summary>
            /// Vector fused multiply subtract
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM64 FMLS
            /// </summary>
            public static Vector128<double>  FusedMultiplySubtract(Vector128<double>  acc, Vector128<double>  left, Vector128<double>  right) { throw null; }

            /// <summary>
            /// Vector fused multiply subtract by element
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right
            ///
            /// Corresponds to vector forms of ARM64 FMLS
            /// </summary>
            /// <param name="acc">Accumulator vector the product is subtracted from.</param>
            /// <param name="left">Vector multiplicand.</param>
            /// <param name="right">Scalar multiplier applied to every element of <paramref name="left"/>.</param>
            // The scalar of the double-precision overload is `double`, matching the vector element type
            // (as MultiplyExtend by element does); existing float arguments still convert implicitly.
            public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
            public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

            /// <summary>
            /// Vector multiply extend
            ///
            /// For each element result[elem] = left[elem] * right[elem]
            /// Handle extend special cases zero and infinite.  FMULX
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MultiplyExtend(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector multiply extend by element
            ///
            /// For each element result[elem] = left[elem] * right
            /// Handle extend special cases zero and infinite.  FMULX
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, float  right) { throw null; }
            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, float  right) { throw null; }
            public static Vector128<double> MultiplyExtend(Vector128<double> left, double right) { throw null; }

            /// <summary>
            /// Vector reciprocal estimate
            ///
            /// See FRECPE docs
            ///
            /// Corresponds to vector forms of ARM64 FRECPE
            /// </summary>
            public static Vector128<double> ReciprocalEstimate(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector reciprocal step
            ///
            /// See FRECPS docs
            ///
            /// Corresponds to vector forms of ARM64 FRECPS
            /// </summary>
            /// <remarks>
            /// NOTE(review): nothing in this listing says what <c>index</c> selects, and FRECPS appears to take
            /// only two vector operands - confirm whether this parameter belongs in the signature.
            /// </remarks>
            public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector reciprocal square root estimate
            ///
            /// See FRSQRTE docs
            ///
            /// Corresponds to vector forms of ARM64 FRSQRTE
            /// </summary>
            public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector reciprocal square root step
            ///
            /// See FRSQRTS docs
            ///
            /// Corresponds to vector forms of ARM64 FRSQRTS
            /// </summary>
            /// <remarks>
            /// NOTE(review): this overload is documented as the "step" operation (FRSQRTS) but is named
            /// ReciprocalSquareRootEstimate; by analogy with ReciprocalEstimate/ReciprocalStep the intended name
            /// is presumably ReciprocalSquareRootStep - confirm before approval.
            /// NOTE(review): the role of <c>index</c> is not described anywhere in this listing - confirm it is needed.
            /// </remarks>
            public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector reverse byte bits
            /// Corresponds to vector forms of ARM64 RBIT
            /// </summary>
            public static Vector64<byte>    ReverseElementBits(Vector64<byte>    value) { throw null; }
            public static Vector64<sbyte>   ReverseElementBits(Vector64<sbyte>   value) { throw null; }
            public static Vector128<byte>   ReverseElementBits(Vector128<byte>   value) { throw null; }
            public static Vector128<sbyte>  ReverseElementBits(Vector128<sbyte>  value) { throw null; }
        }
  }
}

@TamarChristinaArm
Copy link
Contributor

alright, that should be the final list.

@tannergooding
Copy link
Member

Thanks @TamarChristinaArm.

I'll give this a look over either tonight or tomorrow and get the top post updated 😄

@BruceForstall
Copy link
Member

@tannergooding What is the next step here for getting this reviewed? Is it ready to go?
cc @TamarChristinaArm @echesakovMSFT @CarolEidt @sdmaclea

@tannergooding
Copy link
Member

What is the next step here for getting this reviewed? Is it ready to go?

I've updated the original post with @TamarChristinaArm's updated surface.

The next step is just ensuring we get a dedicated review session with @terrajobst. I'll bring it up again today and see if we can drive down a date.

@tannergooding
Copy link
Member

I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.

@TamarChristinaArm, was this determining if VMLA (floating-point) for arm32 is a fused operation? I believe that ended up being my only question and if that dictated them needing to be separate.

If so, it doesn't look like A32 has its own "fused" operation and we should remove those APIs from the general list (in the proposed surface, they look duplicated in both AdvSimd and AdvSimd.Arm64).

@TamarChristinaArm
Copy link
Contributor

TamarChristinaArm commented Oct 16, 2019

I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.

@TamarChristinaArm, was this determining if VMLA (floating-point) for arm32 is a fused operation? I believe that ended up being my only question and if that dictated them needing to be separate.

If so, it doesn't look like A32 has its own "fused" operation and we should remove those APIs from the general list (in the proposed surface, they look duplicated in both AdvSimd and AdvSimd.Arm64).

That's indeed where things got a bit confusing: on A32 the fused version of the instruction is called VFMA, but it doesn't have a fused by-element version.

A64 however only has fused MLA, and so doesn't have the non-fused variant, but does have a by element version of the fused variant.

This is why the split in definition above. The MLA is still useful on A32 if you don't care about the rounding because it does have a by element version then.

@echesakov
Copy link
Contributor

I will take this issue if no one is working on it.

@echesakov echesakov self-assigned this Jan 30, 2020
@TamarChristinaArm
Copy link
Contributor

I'm not working on it, AddAcross and ReverseElementBits I already did since those intersected with my list but didn't work on the rest.

@echesakov
Copy link
Contributor

echesakov commented Jan 30, 2020

@TamarChristinaArm @tannergooding @CarolEidt

I have a question - for MaxNumericPairwise and MinNumericPairwise there are no overloads that operate on one vector (i.e. Vector64<float> MaxNumericPairwise(Vector64<float> value)) even though there is a C++ intrinsic float32_t vpmaxnms_f32 (float32x2_t a) that maps to the FMAXNMP Sd,Vn.2S instruction. The same goes for Vector128<double> MaxNumericPairwise(Vector128<double> value).

Is it intentional?

It might look odd if we had MaxNumericPairwise that has overloads with 1 and 2 operands. Should we instead add MaxNumericAcross(Vector64<float> value) that maps to FMAXNMP Sd,Vn.2S?

I think the same could be done for MaxAcross(Vector64<float> value) and FMAXP Sd,Vn.2S

@CarolEidt
Copy link
Contributor

To me it seems like these should have the same name, and have overloads with one or two operands, since the fundamental operation is the same. It's a bit weird because the operation is always pairwise, but the number of operations & results isn't always consistent for the one-operand and two-operand cases, if I read it correctly (i.e. the one-operand form always produces a single result, while the two-operand form always operates on each pair in the concatenated vector), but that's simply a characteristic of the architecture that we're exposing.

@TamarChristinaArm
Copy link
Contributor

@echesakovMSFT Thanks for reminding me; I was waiting for the API review to ask. But yes, I personally think the single-register pairwise versions should instead be under the reduction intrinsics.

So instead of having a single register MaxNumericPairwise(Vector64<float> value) it should be under MaxAcross.

In C we defined them under vmaxv (Which I didn't here waiting to solicit feedback) and we put them under a new made up intrinsics name since we couldn't overload it. To me it seems more natural to add these single register pairwise operations as reductions.

Like @CarolEidt mentioned the operations aren't exactly the same if we overload MaxNumericPairwise, and also I think we'd be breaking the convention we've used until now for the operations working on the scalar part of the SIMD file. So shouldn't the single register version be MaxNumericPairwiseScalar then?

Or do we want both like in C? overload the reductions and the pair instructions?

@tannergooding
Copy link
Member

So shouldn't the single register version be MaxNumericPairwiseScalar then?

This is my understanding of the conventions we have followed thus far. The instruction is FMAXNMP (scalar) and functionally it is a scalar operation (for which we have always used the Scalar postfix in the name).

The confusion likely comes because, so far, scalar just means "lowest element" and so for something like Vector64<float> AddScalar(Vector64<float> lhs, Vector64<float> rhs), it adds Element 0 from lhs with Element 0 from rhs and returns it in Element 0 of the result.

In this case, the signature would be Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value) which would operate on the scalar pair (Element 0 and Element 1) in value and return it in Element 0 of the result.
So, it is still scalar, it is just that we are "pair-wise" rather than "element-wise"

@tannergooding
Copy link
Member

tannergooding commented Jan 31, 2020

Also, to this note:

Vector64<float> AddScalar(Vector64<float> lhs, Vector64<float> rhs)

I believe the proposed signatures for things like the following are incorrect:

public static byte   MinAcross(Vector64<byte>    value) { throw null; }
public static sbyte  MinAcross(Vector64<sbyte>   value) { throw null; }

The instructions return the result in a SIMD&FP register, not in a general purpose register. So the result should remain Vector64<T>. The purpose of the scalar variants is to remove the need to continuously transition between "scalar" code and "vector" code. If it is an intrinsic that operates on or returns a SIMD register (even if it only treats that value as a scalar), it takes and returns a Vector*<T>.

  • On x86, this principle avoided the chance of the upper bits being trashed or lost
  • On ARM (since upper bits are zeroed), it just simplifies the overall logic and avoids cases like "you must go back to float for simple ops (like addition) but must use HWIntrinsics for other things (like reciprocal)" (and it maintains consistency with the x86 intrinsics from an API perspective)

@terrajobst terrajobst added api-approved API was approved in API review, it can be implemented and removed api-ready-for-review labels Feb 18, 2020
@terrajobst
Copy link
Member

terrajobst commented Feb 18, 2020

  • Check whether MaxPairwise/MinPairwise should also include forms for Vector128
  • Should the arguments for ReciprocalStep and ReciprocalSquareRootStep be something more specific than left and right?
  • Looks like ReverseElementBytes needs more work, as well as clarity on support
  • We skipped Arm32 because it's not going to be implemented for .NET 5
  • ExtractVector shouldn't have overloads for float and double, it could end up silently modifying/normalizing/corrupting the floating point types
  • MaxNumericPairwiseScalar should be MaxNumberPairwiseScalar. Some folks raised concerns around PairwiseScalar being confusing, but it matches the ISA name and we can't think of a better name
  • We didn't review all the APIs (I commented the ones below). Tanner will see whether they are just applying a pattern or are net-new APIs, in which case we'll take another look.
namespace System.Runtime.Intrinsics.Arm
{
    public partial class AdvSimd
    {
        /// <summary>
        /// Vector CompareGreaterThanOrEqual
        /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
        /// Corresponds to vector forms of ARM64 FACGE
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThanOrEqual(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  AbsoluteCompareGreaterThanOrEqual(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector CompareGreaterThan
        ///
        /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGT
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThan(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  AbsoluteCompareGreaterThan(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector absolute difference
        /// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
        /// </summary>
        public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right);
        public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right);
        public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector absolute difference add
        ///
        /// For each element result[elem] = addend[elem] + | left[elem] - right[elem] |
        ///
        /// Note: as declared here, the overloads taking signed inputs return the unsigned
        /// vector type of the same element width.
        ///
        /// Corresponds to vector forms of ARM64 SABA, UABA
        /// </summary>
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

        /// <summary>
        /// Vector add pairwise
        /// For each byte result[byte] = 2*byte < result.Length ? (left[2*byte] + left[2*byte + 1]) : (right[2*byte - result.Length] + right[2*byte + 1 - result.Length])
        /// Corresponds to vector forms of ARM64 ADDP, and FADDP
        /// </summary>
        public static Vector64<byte>   AddPairwise(Vector64<byte>  left, Vector64<byte>  right) ;
        public static Vector64<sbyte>  AddPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) ;
        public static Vector64<ushort> AddPairwise(Vector64<ushort>  left, Vector64<ushort>  right) ;
        public static Vector64<short>  AddPairwise(Vector64<short>  left, Vector64<short>  right) ;
        public static Vector64<int>    AddPairwise(Vector64<int>  left, Vector64<int>  right) ;
        public static Vector64<uint>   AddPairwise(Vector64<uint>  left, Vector64<uint>  right) ;
        public static Vector64<float>  AddPairwise(Vector64<float>  left, Vector64<float>  right) ;

        /// <summary>
        /// Vector extract from pair of vectors
        /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        public static Vector64<byte>   ExtractVector64(Vector64<byte>  upper, Vector64<byte>  lower, byte byteIndex);
        public static Vector64<sbyte>  ExtractVector64(Vector64<sbyte>  upper, Vector64<sbyte>  lower, byte byteIndex);
        public static Vector64<short>  ExtractVector64(Vector64<short>  upper, Vector64<short>  lower, byte byteIndex);
        public static Vector64<ushort> ExtractVector64(Vector64<ushort>  upper, Vector64<ushort>  lower, byte byteIndex);
        public static Vector64<int>    ExtractVector64(Vector64<int>  upper, Vector64<int>  lower, byte byteIndex);
        public static Vector64<uint>   ExtractVector64(Vector64<uint>  upper, Vector64<uint>  lower, byte byteIndex);

        public static Vector128<byte>   ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
        public static Vector128<sbyte>  ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
        public static Vector128<short>  ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
        public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
        public static Vector128<int>    ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
        public static Vector128<uint>   ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
        public static Vector128<long>   ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
        public static Vector128<ulong>  ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
        public static Vector128<float>  ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
        public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);

        /// <summary>
        /// Vector max numeric
        /// Corresponds to vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector64<float>   MaxNumber(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  MaxNumber(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector max pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
        /// </summary>
        public static Vector64<byte>    MaxPairwise(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MaxPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MaxPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MaxPairwise(Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MaxPairwise(Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MaxPairwise(Vector64<int>     left, Vector64<int>     right);
        public static Vector64<float>   MaxPairwise(Vector64<float>   left, Vector64<float>   right);

        /// <summary>
        /// Vector min numeric
        /// Corresponds to vector forms of ARM64 FMINNM
        /// </summary>
        public static Vector64<float>   MinNumber(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  MinNumber(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector min pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
        /// </summary>
        public static Vector64<byte>    MinPairwise(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MinPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MinPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MinPairwise(Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MinPairwise(Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MinPairwise(Vector64<int>     left, Vector64<int>     right);
        public static Vector64<float>   MinPairwise(Vector64<float>   left, Vector64<float>   right);

        /// <summary>
        /// Vector multiply add
        ///
        /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MultiplyAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MultiplyAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MultiplyAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
        public static Vector128<byte>   MultiplyAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  MultiplyAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   MultiplyAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    MultiplyAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

        /// <summary>
        /// Vector multiply add by element
        ///
        /// For each element result[elem] = addend[elem] + left[elem] * scalar,
        /// where scalar is presumably right[rightIndex] (TODO confirm against the MLA by-element form)
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAddBySelectedScalar(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplyAddBySelectedScalar(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplyAddBySelectedScalar(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplyAddBySelectedScalar(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplyAddBySelectedScalar(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplyAddBySelectedScalar(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplyAddBySelectedScalar(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplyAddBySelectedScalar(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplyAddBySelectedScalar(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplyAddBySelectedScalar(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplyAddBySelectedScalar(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        /// <summary>
        /// Vector multiply subtract
        ///
        /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtract(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MultiplySubtract(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MultiplySubtract(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MultiplySubtract(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right);
        public static Vector128<byte>   MultiplySubtract(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  MultiplySubtract(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   MultiplySubtract(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    MultiplySubtract(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right);

        /// <summary>
        /// Vector multiply subtract by element
        ///
        /// For each element result[elem] = minuend[elem] - left[elem] * scalar,
        /// where scalar is presumably right[rightIndex] (TODO confirm against the MLS by-element form)
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtractBySelectedScalar(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplySubtractBySelectedScalar(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplySubtractBySelectedScalar(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplySubtractBySelectedScalar(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplySubtractBySelectedScalar(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplySubtractBySelectedScalar(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplySubtractBySelectedScalar(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplySubtractBySelectedScalar(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplySubtractBySelectedScalar(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplySubtractBySelectedScalar(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplySubtractBySelectedScalar(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        /// <summary>
        /// Vector fused multiply add
        ///
        /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLA
        /// </summary>
        public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   addend, Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  addend, Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector fused multiply subtract
        ///
        /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLS
        /// </summary>
        public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   minuend, Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  minuend, Vector128<float>  left, Vector128<float>  right);


        /// <summary>
        /// Vector polynomial multiply
        /// Corresponds to vector forms of ARM64 PMUL
        /// </summary>
        public static Vector64<byte>    PolynomialMultiply(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   PolynomialMultiply(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector128<byte>   PolynomialMultiply(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  PolynomialMultiply(Vector128<sbyte>  left, Vector128<sbyte>  right);

        /// <summary>
        /// Vector reciprocal estimate
        ///
        /// See FRECPE docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPE
        /// </summary>
        public static Vector64<float>   ReciprocalEstimate(Vector64<float>   value);
        public static Vector128<float>  ReciprocalEstimate(Vector128<float>  value);

        /// <summary>
        /// Vector reciprocal step
        ///
        /// See FRECPS docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPS
        /// </summary>
        public static Vector64<float>   ReciprocalStep(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  ReciprocalStep(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector reciprocal square root estimate
        ///
        /// See FRSQRTE docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTE
        /// </summary>
        public static Vector64<float>   ReciprocalSquareRootEstimate(Vector64<float>   value);
        public static Vector128<float>  ReciprocalSquareRootEstimate(Vector128<float>  value);

        /// <summary>
        /// Vector reciprocal square root step
        ///
        /// See FRSQRTS docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTS
        /// </summary>
        public static Vector64<float>   ReciprocalSquareRootStep(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  ReciprocalSquareRootStep(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Nested Arm64 class of this proposal. It lists the Vector128&lt;double&gt; overloads together with the
        /// pairwise and across-vector operations declared below. Everything after the "Not reviewed" marker
        /// is commented out and was not part of the review.
        /// </summary>
        public partial class Arm64
        {
            /// <summary>
            /// Vector absolute compare greater than or equal
            ///
            /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
            ///
            /// Corresponds to vector forms of ARM64 FACGE
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector absolute compare greater than
            ///
            /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
            ///
            /// Corresponds to vector forms of ARM64 FACGT
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector absolute difference
            ///
            /// For each element result[elem] = | left[elem] - right[elem] |
            ///
            /// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
            /// </summary>
            public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector add pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 ADDP, and FADDP
            /// </summary>
            public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   AddPairwise(Vector128<uint>   left, Vector128<uint>   right);
            public static Vector128<int>    AddPairwise(Vector128<int>    left, Vector128<int>    right);
            public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right);
            public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right);
            public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right);
            public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector extract from pair of vectors
            ///
            /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
            ///
            /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
            ///
            /// Corresponds to vector forms of ARM64 EXT
            /// </summary>
            public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index);

            /// <summary>
            /// Vector add across vector elements
            ///
            /// result = value[0] + ... + value[length - 1]
            ///
            /// Corresponds to vector forms of ARM64 ADDV
            /// </summary>
            public static Vector64<byte>   AddAcross(Vector64<byte>    value);
            public static Vector64<sbyte>  AddAcross(Vector64<sbyte>   value);
            public static Vector64<ushort> AddAcross(Vector64<ushort>  value);
            public static Vector64<short>  AddAcross(Vector64<short>   value);
            public static Vector64<uint>   AddAcross(Vector64<uint>    value);
            public static Vector64<int>    AddAcross(Vector64<int>     value);
            public static Vector64<byte>   AddAcross(Vector128<byte>   value);
            public static Vector64<sbyte>  AddAcross(Vector128<sbyte>  value);
            public static Vector64<ushort> AddAcross(Vector128<ushort> value);
            public static Vector64<short>  AddAcross(Vector128<short>  value);
            public static Vector64<uint>   AddAcross(Vector128<uint>   value);
            public static Vector64<int>    AddAcross(Vector128<int>    value);

            /// <summary>
            /// Vector max numeric
            ///
            /// Corresponds to vector forms of ARM64 FMAXNM
            /// </summary>
            public static Vector128<double> MaxNumber(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector max numeric pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMP
            /// </summary>
            public static Vector64<float>   MaxNumberPairwise(Vector64<float>   left, Vector64<float>   right);
            public static Vector128<float>  MaxNumberPairwise(Vector128<float>  left, Vector128<float>  right);
            public static Vector128<double> MaxNumberPairwise(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector max numeric across
            ///
            /// result = max(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMV
            /// </summary>
            public static Vector64<float> MaxNumberAcross(Vector128<float>  value);

            /// <summary>
            /// Vector max pairwise
            ///
            /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
            /// </summary>
            public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right);
            public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right);
            public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right);
            public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector max across
            ///
            /// result = max(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 SMAXV, UMAXV, and FMAXV
            /// </summary>
            public static Vector64<byte>   MaxAcross(Vector64<byte>    value);
            public static Vector64<sbyte>  MaxAcross(Vector64<sbyte>   value);
            public static Vector64<ushort> MaxAcross(Vector64<ushort>  value);
            public static Vector64<short>  MaxAcross(Vector64<short>   value);
            public static Vector64<uint>   MaxAcross(Vector64<uint>    value);
            public static Vector64<int>    MaxAcross(Vector64<int>     value);
            public static Vector64<float>  MaxAcross(Vector64<float>   value);
            public static Vector64<byte>   MaxAcross(Vector128<byte>   value);
            public static Vector64<sbyte>  MaxAcross(Vector128<sbyte>  value);
            public static Vector64<ushort> MaxAcross(Vector128<ushort> value);
            public static Vector64<short>  MaxAcross(Vector128<short>  value);
            public static Vector64<uint>   MaxAcross(Vector128<uint>   value);
            public static Vector64<int>    MaxAcross(Vector128<int>    value);
            public static Vector64<ulong>  MaxAcross(Vector128<ulong>  value);
            public static Vector64<long>   MaxAcross(Vector128<long>   value);
            public static Vector64<float>  MaxAcross(Vector128<float>  value);
            public static Vector64<double> MaxAcross(Vector128<double> value);

// Not reviewed:
//
//            /// <summary>
//            /// Vector min numeric
//            /// Corresponds to vector forms of ARM64 FMINNM
//            /// </summary>
//            public static Vector128<double> MinNumber(Vector128<double> left, Vector128<double> right);
//
//            /// <summary>
//            /// Vector min numeric pairwise
//            ///
//            /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
//            ///
//            /// Corresponds to vector forms of ARM64 FMINNMP
//            /// </summary>
//            public static Vector64<float>   MinNumberPairwise(Vector64<float>   left, Vector64<float>   right);
//            public static Vector128<float>  MinNumberPairwise(Vector128<float>  left, Vector128<float>  right);
//            public static Vector128<double> MinNumberPairwise(Vector128<double> left, Vector128<double> right);
//
//            /// <summary>
//            /// Vector min numeric across
//            ///
//            /// result = min(value[0], ... , value[length - 1])
//            ///
//            /// Corresponds to vector forms of ARM64 FMINNMV
//            /// </summary>
//            public static float  MinNumberAcross(Vector128<float>  value);
//
//            /// <summary>
//            /// Vector min pairwise
//            ///
//            /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
//            ///
//            /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
//            /// </summary>
//            public static Vector128<byte>   MinPairwise(Vector128<byte>   left, Vector128<byte>   right);
//            public static Vector128<sbyte>  MinPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
//            public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right);
//            public static Vector128<short>  MinPairwise(Vector128<short>  left, Vector128<short>  right);
//            public static Vector128<uint>   MinPairwise(Vector128<uint>   left, Vector128<uint>   right);
//            public static Vector128<int>    MinPairwise(Vector128<int>    left, Vector128<int>    right);
//            public static Vector128<float>  MinPairwise(Vector128<float>  left, Vector128<float>  right);
//            public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right);
//
//            /// <summary>
//            /// Vector min across
//            ///
//            /// result = min(value[0], ... , value[length - 1])
//            ///
//            /// Corresponds to vector forms of ARM64 SMINV, UMINV, and FMINV
//            /// </summary>
//            public static byte   MinAcross(Vector64<byte>    value);
//            public static sbyte  MinAcross(Vector64<sbyte>   value);
//            public static ushort MinAcross(Vector64<ushort>  value);
//            public static short  MinAcross(Vector64<short>   value);
//            public static uint   MinAcross(Vector64<uint>    value);
//            public static int    MinAcross(Vector64<int>     value);
//            public static float  MinAcross(Vector64<float>   value);
//            public static byte   MinAcross(Vector128<byte>   value);
//            public static sbyte  MinAcross(Vector128<sbyte>  value);
//            public static ushort MinAcross(Vector128<ushort> value);
//            public static short  MinAcross(Vector128<short>  value);
//            public static uint   MinAcross(Vector128<uint>   value);
//            public static int    MinAcross(Vector128<int>    value);
//            public static float  MinAcross(Vector128<float>  value);
//            public static double MinAcross(Vector128<double> value);
//
//            /// <summary>
//            /// Vector fused multiply add
//            ///
//            /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
//            ///
//            /// Corresponds to vector forms of ARM64 FMLA
//            /// </summary>
//            public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, Vector128<double> right);
//
//            /// <summary>
//            /// Vector fused multiply add by element
//            ///
//            /// For each element result[elem] = acc[elem] + left[elem] * right
//            ///
//            /// Corresponds to vector forms of ARM64 FMLA
//            /// </summary>
//            public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right);
//            public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right);
//            public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right);
//            NOTE(review): the line above is an exact duplicate of the preceding Vector128<float> overload;
//            presumably a Vector128<double> overload was intended - confirm before un-commenting.
//
//            /// <summary>
//            /// Vector fused multiply subtract
//            ///
//            /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
//            ///
//            /// Corresponds to vector forms of ARM64 FMLS
//            /// </summary>
//            public static Vector128<double>  FusedMultiplySubtract(Vector128<double>  acc, Vector128<double>  left, Vector128<double>  right);
//
//            /// <summary>
//            /// Vector fused multiply subtract by element
//            ///
//            /// For each element result[elem] = acc[elem] - left[elem] * right
//            ///
//            /// Corresponds to vector forms of ARM64 FMLS
//            /// </summary>
//            public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right);
//            public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right);
//            public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, float   right);
//            NOTE(review): the Vector128<double> overload above takes a float scalar; presumably
//            "double right" was intended (compare the MultiplyExtend by-element overloads) - confirm.
//
//            /// <summary>
//            /// Vector multiply extend
//            ///
//            /// For each element result[elem] = left[elem] * right[elem]
//            /// Handle extend special cases zero and infinite.  FMULX
//            ///
//            /// Corresponds to vector forms of ARM64 FMULX
//            /// </summary>
//            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, Vector64<float>   right);
//            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, Vector128<float>  right);
//            public static Vector128<double> MultiplyExtend(Vector128<double> left, Vector128<double> right);
//
//            /// <summary>
//            /// Vector multiply extend by element
//            ///
//            /// For each element result[elem] = left[elem] * right
//            /// Handle extend special cases zero and infinite.  FMULX
//            ///
//            /// Corresponds to vector forms of ARM64 FMULX
//            /// </summary>
//            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, float  right);
//            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, float  right);
//            public static Vector128<double> MultiplyExtend(Vector128<double> left, double right);
//
//            /// <summary>
//            /// Vector reciprocal estimate
//            ///
//            /// See FRECPE docs
//            ///
//            /// Corresponds to vector forms of ARM64 FRECPE
//            /// </summary>
//            public static Vector128<double> ReciprocalEstimate(Vector128<double> value);
//
//            /// <summary>
//            /// Vector reciprocal step
//            ///
//            /// See FRECPS docs
//            ///
//            /// Corresponds to vector forms of ARM64 FRECPS
//            /// </summary>
//            public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right, byte index);
//
//            /// <summary>
//            /// Vector reciprocal square root estimate
//            ///
//            /// See FRSQRTE docs
//            ///
//            /// Corresponds to vector forms of ARM64 FRSQRTE
//            /// </summary>
//            public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value);
//
//            /// <summary>
//            /// Vector reciprocal square root step
//            ///
//            /// See FRSQRTS docs
//            ///
//            /// Corresponds to vector forms of ARM64 FRSQRTS
//            /// </summary>
//            public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> left, Vector128<double> right, byte index);
//            NOTE(review): the summary above describes the FRSQRTS step operation, but the method is named
//            ReciprocalSquareRootEstimate; presumably ReciprocalSquareRootStep was intended - confirm.
//
//            /// <summary>
//            /// Vector reverse byte bits
//            /// Corresponds to vector forms of ARM64 RBIT
//            /// </summary>
//            public static Vector64<byte>    ReverseElementBits(Vector64<byte>    value);
//            public static Vector64<sbyte>   ReverseElementBits(Vector64<sbyte>   value);
//            public static Vector128<byte>   ReverseElementBits(Vector128<byte>   value);
//            public static Vector128<sbyte>  ReverseElementBits(Vector128<sbyte>  value);
        }
  }
}

@terrajobst
Copy link
Member

  • We finished the review today. This comment repeats all the APIs and comments.
  • Check whether MaxPairwise/MinPairwise should also include forms for Vector128
  • Should the arguments for ReciprocalStep and ReciprocalSquareRootStep be something more specific than left and right?
  • Looks like ReverseElementBytes needs more work, as well as clarity on support
  • We skipped Arm32 because it's not going to be implemented for .NET 5
  • ExtractVector shouldn't have overloads for float and double, it could end up silently modifying/normalizing/corrupting the floating point types
  • MaxNumericPairwiseScalar should be MaxNumberPairwiseScalar. Some folks raised concerns around PairwiseScalar being confusing, but it matches the ISA name and we can't think of a better name
  • We didn't review all the APIs (I commented out the ones we skipped below). Tanner will see whether they are just applying a pattern or are net-new APIs, in which case we'll take another look.
namespace System.Runtime.Intrinsics.Arm
{
    public partial class AdvSimd
    {
        /// <summary>
        /// Vector absolute compare greater than or equal
        ///
        /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGE
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThanOrEqual(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  AbsoluteCompareGreaterThanOrEqual(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector absolute compare greater than
        ///
        /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGT
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThan(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  AbsoluteCompareGreaterThan(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector absolute difference
        ///
        /// For each element result[elem] = | left[elem] - right[elem] |
        ///
        /// Note: the overloads taking signed integer vectors return the unsigned vector type of the same element width
        ///
        /// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
        /// </summary>
        public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right);
        public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right);
        public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector absolute difference add
        ///
        /// For each element result[elem] = addend[elem] + | left[elem] - right[elem] |
        ///
        /// Note: the overloads taking signed integer vectors are declared here with an unsigned return type
        ///
        /// Corresponds to vector forms of ARM64 SABA, UABA
        /// </summary>
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

        /// <summary>
        /// Vector add pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 ADDP, and FADDP
        /// </summary>
        public static Vector64<byte>   AddPairwise(Vector64<byte>  left, Vector64<byte>  right) ;
        public static Vector64<sbyte>  AddPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) ;
        public static Vector64<ushort> AddPairwise(Vector64<ushort>  left, Vector64<ushort>  right) ;
        public static Vector64<short>  AddPairwise(Vector64<short>  left, Vector64<short>  right) ;
        public static Vector64<int>    AddPairwise(Vector64<int>  left, Vector64<int>  right) ;
        public static Vector64<uint>   AddPairwise(Vector64<uint>  left, Vector64<uint>  right) ;
        public static Vector64<float>  AddPairwise(Vector64<float>  left, Vector64<float>  right) ;

        /// <summary>
        /// Vector extract from pair of vectors
        ///
        /// For each byte result[byte] = byte + byteIndex < result.Length ? upper[byte + byteIndex] : lower[byte + byteIndex - result.Length]
        ///
        /// Note: byteIndex must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        public static Vector64<byte>   ExtractVector64(Vector64<byte>  upper, Vector64<byte>  lower, byte byteIndex);
        public static Vector64<sbyte>  ExtractVector64(Vector64<sbyte>  upper, Vector64<sbyte>  lower, byte byteIndex);
        public static Vector64<short>  ExtractVector64(Vector64<short>  upper, Vector64<short>  lower, byte byteIndex);
        public static Vector64<ushort> ExtractVector64(Vector64<ushort>  upper, Vector64<ushort>  lower, byte byteIndex);
        public static Vector64<int>    ExtractVector64(Vector64<int>  upper, Vector64<int>  lower, byte byteIndex);
        public static Vector64<uint>   ExtractVector64(Vector64<uint>  upper, Vector64<uint>  lower, byte byteIndex);

        // 128-bit forms of the same extract operation.
        // NOTE(review): the review notes for this listing say ExtractVector should not have float and double
        // overloads, yet the last two declarations below are float/double - confirm whether they are intended.
        public static Vector128<byte>   ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
        public static Vector128<sbyte>  ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
        public static Vector128<short>  ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
        public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
        public static Vector128<int>    ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
        public static Vector128<uint>   ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
        public static Vector128<long>   ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
        public static Vector128<ulong>  ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
        public static Vector128<float>  ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
        public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);

        /// <summary>
        /// Vector max numeric (single-precision floating-point overloads)
        ///
        /// Corresponds to vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector64<float>   MaxNumber(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  MaxNumber(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector max pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
        /// </summary>
        public static Vector64<byte>    MaxPairwise(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MaxPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MaxPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MaxPairwise(Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MaxPairwise(Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MaxPairwise(Vector64<int>     left, Vector64<int>     right);
        public static Vector64<float>   MaxPairwise(Vector64<float>   left, Vector64<float>   right);

        /// <summary>
        /// Vector min numeric (single-precision floating-point overloads)
        ///
        /// Corresponds to vector forms of ARM64 FMINNM
        /// </summary>
        public static Vector64<float>   MinNumber(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  MinNumber(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector min pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
        /// </summary>
        public static Vector64<byte>    MinPairwise(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MinPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MinPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MinPairwise(Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MinPairwise(Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MinPairwise(Vector64<int>     left, Vector64<int>     right);
        public static Vector64<float>   MinPairwise(Vector64<float>   left, Vector64<float>   right);

        /// <summary>
        /// Vector multiply add
        ///
        /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MultiplyAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MultiplyAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MultiplyAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
        public static Vector128<byte>   MultiplyAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  MultiplyAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   MultiplyAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    MultiplyAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

        /// <summary>
        /// Vector multiply add by element
        ///
        /// For each element result[elem] = addend[elem] + left[elem] * right[rightIndex]
        ///
        /// NOTE(review): presumably rightIndex must be a JIT time constant within the bounds of right - confirm
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAddBySelectedScalar(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplyAddBySelectedScalar(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplyAddBySelectedScalar(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplyAddBySelectedScalar(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplyAddBySelectedScalar(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplyAddBySelectedScalar(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplyAddBySelectedScalar(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplyAddBySelectedScalar(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplyAddBySelectedScalar(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplyAddBySelectedScalar(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplyAddBySelectedScalar(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        /// <summary>
        /// Vector multiply subtract
        ///
        /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtract(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right);
        public static Vector64<short>   MultiplySubtract(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right);
        public static Vector64<uint>    MultiplySubtract(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right);
        public static Vector64<int>     MultiplySubtract(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right);
        public static Vector128<byte>   MultiplySubtract(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  MultiplySubtract(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   MultiplySubtract(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    MultiplySubtract(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right);

        /// <summary>
        /// Vector multiply subtract by element
        ///
        /// For each element result[elem] = minuend[elem] - left[elem] * right[rightIndex]
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtractBySelectedScalar(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplySubtractBySelectedScalar(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplySubtractBySelectedScalar(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplySubtractBySelectedScalar(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplySubtractBySelectedScalar(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplySubtractBySelectedScalar(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplySubtractBySelectedScalar(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplySubtractBySelectedScalar(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplySubtractBySelectedScalar(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplySubtractBySelectedScalar(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplySubtractBySelectedScalar(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        /// <summary>
        /// Vector fused multiply add
        ///
        /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLA
        /// </summary>
        public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   addend, Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  addend, Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector fused multiply subtract
        ///
        /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLS
        /// </summary>
        public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   minuend, Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  minuend, Vector128<float>  left, Vector128<float>  right);


        /// <summary>
        /// Vector polynomial multiply
        /// Corresponds to vector forms of ARM64 PMUL
        /// </summary>
        public static Vector64<byte>    PolynomialMultiply(Vector64<byte>    left, Vector64<byte>    right);
        public static Vector64<sbyte>   PolynomialMultiply(Vector64<sbyte>   left, Vector64<sbyte>   right);
        public static Vector128<byte>   PolynomialMultiply(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  PolynomialMultiply(Vector128<sbyte>  left, Vector128<sbyte>  right);

        /// <summary>
        /// Vector reciprocal estimate
        ///
        /// See FRECPE docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPE
        /// </summary>
        public static Vector64<float>   ReciprocalEstimate(Vector64<float>   value);
        public static Vector128<float>  ReciprocalEstimate(Vector128<float>  value);

        /// <summary>
        /// Vector reciprocal step
        ///
        /// See FRECPS docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPS
        /// </summary>
        public static Vector64<float>   ReciprocalStep(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  ReciprocalStep(Vector128<float>  left, Vector128<float>  right);

        /// <summary>
        /// Vector reciprocal square root estimate
        ///
        /// See FRSQRTE docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTE
        /// </summary>
        public static Vector64<float>   ReciprocalSquareRootEstimate(Vector64<float>   value);
        public static Vector128<float>  ReciprocalSquareRootEstimate(Vector128<float>  value);

        /// <summary>
        /// Vector reciprocal square root step
        ///
        /// See FRSQRTS docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTS
        /// </summary>
        public static Vector64<float>   ReciprocalSquareRootStep(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  ReciprocalSquareRootStep(Vector128<float>  left, Vector128<float>  right);

        public partial class Arm64
        {
            /// <summary>
            /// Vector CompareGreaterThanOrEqual
            /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
            /// Corresponds to vector forms of ARM64 FACGE
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector CompareGreaterThan
            ///
            /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
            ///
            /// Corresponds to vector forms of ARM64 FACGT
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector absolute difference
            /// For each element result[elem] = |left[elem] - right[elem]|
            /// Corresponds to the vector form of ARM64 FABD (the only overload here is floating-point)
            /// </summary>
            public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector add pairwise
            /// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
            /// Corresponds to vector forms of ARM64 ADDP, and FADDP
            /// </summary>
            public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   AddPairwise(Vector128<uint>   left, Vector128<uint>   right);
            public static Vector128<int>    AddPairwise(Vector128<int>    left, Vector128<int>    right);
            public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right);
            public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right);
            public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right);
            public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector extract from pair of vectors
            /// For each byte result[byte] = byte + index &lt; result.Length ? left[byte + index] : right[byte + index - result.Length]
            ///
            /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
            ///
            /// Corresponds to vector forms of ARM64 EXT
            /// </summary>
            public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index);

            /// <summary>
            /// Vector add across vector elements
            /// Corresponds to vector forms of ARM64 ADDV
            /// </summary>
            public static Vector64<byte>   AddAcross(Vector64<byte>    value);
            public static Vector64<sbyte>  AddAcross(Vector64<sbyte>   value);
            public static Vector64<ushort> AddAcross(Vector64<ushort>  value);
            public static Vector64<short>  AddAcross(Vector64<short>   value);
            public static Vector64<uint>   AddAcross(Vector64<uint>    value);
            public static Vector64<int>    AddAcross(Vector64<int>     value);
            public static Vector64<byte>   AddAcross(Vector128<byte>   value);
            public static Vector64<sbyte>  AddAcross(Vector128<sbyte>  value);
            public static Vector64<ushort> AddAcross(Vector128<ushort> value);
            public static Vector64<short>  AddAcross(Vector128<short>  value);
            public static Vector64<uint>   AddAcross(Vector128<uint>   value);
            public static Vector64<int>    AddAcross(Vector128<int>    value);

            /// <summary>
            /// Vector max numeric
            /// Corresponds to vector forms of ARM64 FMAXNM
            /// </summary>
            public static Vector128<double> MaxNumber(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector max numeric pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMP
            /// </summary>
            public static Vector64<float>   MaxNumberPairwise(Vector64<float>   left, Vector64<float>   right);
            public static Vector128<float>  MaxNumberPairwise(Vector128<float>  left, Vector128<float>  right);
            public static Vector128<double> MaxNumberPairwise(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector max numeric across
            ///
            /// result = max(value[0], ... , value[length -1])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMV
            /// </summary>
            public static Vector64<float> MaxNumberAcross(Vector128<float>  value);

            /// <summary>
            /// Vector max pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
            /// </summary>
            public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right);
            public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right);
            public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right);
            public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right);

            /// <summary>
            /// Vector max across
            ///
            /// result = max(value[0], ... , value[length -1])
            ///
            /// Corresponds to vector forms of ARM64 SMAXV, UMAXV, and FMAXV
            /// </summary>
            public static Vector64<byte>   MaxAcross(Vector64<byte>    value);
            public static Vector64<sbyte>  MaxAcross(Vector64<sbyte>   value);
            public static Vector64<ushort> MaxAcross(Vector64<ushort>  value);
            public static Vector64<short>  MaxAcross(Vector64<short>   value);
            public static Vector64<uint>   MaxAcross(Vector64<uint>    value);
            public static Vector64<int>    MaxAcross(Vector64<int>     value);
            public static Vector64<float>  MaxAcross(Vector64<float>   value);
            public static Vector64<byte>   MaxAcross(Vector128<byte>   value);
            public static Vector64<sbyte>  MaxAcross(Vector128<sbyte>  value);
            public static Vector64<ushort> MaxAcross(Vector128<ushort> value);
            public static Vector64<short>  MaxAcross(Vector128<short>  value);
            public static Vector64<uint>   MaxAcross(Vector128<uint>   value);
            public static Vector64<int>    MaxAcross(Vector128<int>    value);
            public static Vector64<ulong>  MaxAcross(Vector128<ulong>  value);
            public static Vector64<long>   MaxAcross(Vector128<long>   value);
            public static Vector64<float>  MaxAcross(Vector128<float>  value);
            public static Vector64<double> MaxAcross(Vector128<double> value);

           // -------------------------------------------------
           // Reviewed today:
           // -------------------------------------------------


           /// <summary>
           /// Vector min numeric
           /// Corresponds to vector forms of ARM64 FMINNM
           /// </summary>
           public static Vector128<double> MinNumber(Vector128<double> left, Vector128<double> right);

           /// <summary>
           /// Vector min numeric pairwise
           ///
           /// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
           ///
           /// Corresponds to vector forms of ARM64 FMINNMP
           /// </summary>
           public static Vector64<float>   MinNumberPairwise(Vector64<float>   left, Vector64<float>   right);
           public static Vector128<float>  MinNumberPairwise(Vector128<float>  left, Vector128<float>  right);
           public static Vector128<double> MinNumberPairwise(Vector128<double> left, Vector128<double> right);

           // NOTE(review): this overload returns a scalar float, whereas MaxNumberAcross and MinAcross
           // in this class return Vector64<T> - confirm which return shape is intended.
           /// <summary>
           /// Vector min numeric across
           ///
           /// result = min(value[0], ... , value[length -1])
           ///
           /// Corresponds to vector forms of ARM64 FMINNMV
           /// </summary>
           public static float  MinNumberAcross(Vector128<float>  value);

           /// <summary>
           /// Vector min pairwise
           ///
           /// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
           ///
           /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
           /// </summary>
           public static Vector128<byte>   MinPairwise(Vector128<byte>   left, Vector128<byte>   right);
           public static Vector128<sbyte>  MinPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
           public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right);
           public static Vector128<short>  MinPairwise(Vector128<short>  left, Vector128<short>  right);
           public static Vector128<uint>   MinPairwise(Vector128<uint>   left, Vector128<uint>   right);
           public static Vector128<int>    MinPairwise(Vector128<int>    left, Vector128<int>    right);
           public static Vector128<float>  MinPairwise(Vector128<float>  left, Vector128<float>  right);
           public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right);

           /// <summary>
           /// Vector min across
           ///
           /// result = min(value[0], ... , value[length -1])
           ///
           /// Corresponds to vector forms of ARM64 SMINV, UMINV, and FMINV
           /// </summary>
           public static Vector64<byte>   MinAcross(Vector64<byte>    value);
           public static Vector64<sbyte>  MinAcross(Vector64<sbyte>   value);
           public static Vector64<ushort> MinAcross(Vector64<ushort>  value);
           public static Vector64<short>  MinAcross(Vector64<short>   value);
           public static Vector64<uint>   MinAcross(Vector64<uint>    value);
           public static Vector64<int>    MinAcross(Vector64<int>     value);
           public static Vector64<float>  MinAcross(Vector64<float>   value);
           public static Vector64<byte>   MinAcross(Vector128<byte>   value);
           public static Vector64<sbyte>  MinAcross(Vector128<sbyte>  value);
           public static Vector64<ushort> MinAcross(Vector128<ushort> value);
           public static Vector64<short>  MinAcross(Vector128<short>  value);
           public static Vector64<uint>   MinAcross(Vector128<uint>   value);
           public static Vector64<int>    MinAcross(Vector128<int>    value);
           public static Vector64<float>  MinAcross(Vector128<float>  value);
           public static Vector64<double> MinAcross(Vector128<double> value);

           /// <summary>
           /// Vector fused multiply add
           ///
           /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
           ///
           /// Corresponds to vector forms of ARM64 FMLA
           /// </summary>
           public static Vector128<double> FusedMultiplyAdd(Vector128<double> addend, Vector128<double> left, Vector128<double> right);

           /// <summary>
           /// Vector fused multiply add by element
           ///
           /// For each element result[elem] = addend[elem] + left[elem] * right
           /// where right is a single scalar applied to every element of left.
           ///
           /// Corresponds to vector forms of ARM64 FMLA
           /// </summary>
           public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   addend, Vector64<float>   left, float  right);
           public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  addend, Vector128<float>  left, float  right);
           // The third overload was an exact duplicate of the Vector128<float> one (a duplicate-member error);
           // it is the double-precision form, mirroring the FusedMultiplySubtract by-element group.
           public static Vector128<double> FusedMultiplyAdd(Vector128<double> addend, Vector128<double> left, double right);

           /// <summary>
           /// Vector fused multiply subtract
           ///
           /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
           ///
           /// Corresponds to vector forms of ARM64 FMLS
           /// </summary>
           public static Vector128<double>  FusedMultiplySubtract(Vector128<double> minuend, Vector128<double>  left, Vector128<double> right);

           /// <summary>
           /// Vector fused multiply subtract by element
           ///
           /// For each element result[elem] = minuend[elem] - left[elem] * right
           /// where right is a single scalar applied to every element of left.
           ///
           /// Corresponds to vector forms of ARM64 FMLS
           /// </summary>
           public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   minuend, Vector64<float>   left, float  right);
           public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  minuend, Vector128<float>  left, float  right);
           // The scalar must match the vector element type: the double overload takes a double scalar
           // (callers passing a float still compile through the implicit float-to-double conversion).
           public static Vector128<double> FusedMultiplySubtract(Vector128<double> minuend, Vector128<double> left, double right);

           /// <summary>
           /// Vector multiply extend
           ///
           /// For each element result[elem] = left[elem] * right[elem]
           /// Handle extend special cases zero and infinite.  FMULX
           ///
           /// Corresponds to vector forms of ARM64 FMULX
           /// </summary>
           public static Vector64<float>   MultiplyExtended(Vector64<float>   left, Vector64<float>   right);
           public static Vector128<float>  MultiplyExtended(Vector128<float>  left, Vector128<float>  right);
           public static Vector128<double> MultiplyExtended(Vector128<double> left, Vector128<double> right);

           /// <summary>
           /// Vector multiply extend by element
           ///
           /// For each element result[elem] = left[elem] * right[rightIndex]
           /// Handle extend special cases zero and infinite.  FMULX
           ///
           /// Corresponds to vector forms of ARM64 FMULX
           /// </summary>
            public static Vector64<float>   MultiplyExtendedBySelectedScalar(Vector64<float>   left, Vector64<float>   right, byte rightIndex);
            public static Vector128<float>  MultiplyExtendedBySelectedScalar(Vector128<float>  left, Vector128<float>  right, byte rightIndex);
            public static Vector128<double> MultiplyExtendedBySelectedScalar(Vector128<double> left, Vector128<double> right, byte rightIndex);

           /// <summary>
           /// Vector reciprocal estimate
           ///
           /// See FRECPE docs
           ///
           /// Corresponds to vector forms of ARM64 FRECPE
           /// </summary>
           public static Vector128<double> ReciprocalEstimate(Vector128<double> value);

           /// <summary>
           /// Vector reciprocal step
           ///
           /// See FRECPS docs
           ///
           /// Corresponds to vector forms of ARM64 FRECPS
           /// </summary>
           public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right);

           /// <summary>
           /// Vector reciprocal square root estimate
           ///
           /// See FRSQRTE docs
           ///
           /// Corresponds to vector forms of ARM64 FRSQRTE
           /// </summary>
           public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value);

           /// <summary>
           /// Vector reciprocal square root step
           ///
           /// See FRSQRTS docs
           ///
           /// Corresponds to vector forms of ARM64 FRSQRTS
           /// </summary>
           public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right);

           /// <summary>
           /// Vector reverse byte bits
           /// Corresponds to vector forms of ARM64 RBIT
           /// </summary>
           public static Vector64<byte>    ReverseElementBits(Vector64<byte>    value);
           public static Vector64<sbyte>   ReverseElementBits(Vector64<sbyte>   value);
           public static Vector128<byte>   ReverseElementBits(Vector128<byte>   value);
           public static Vector128<sbyte>  ReverseElementBits(Vector128<sbyte>  value);
        }
    }
}

@echesakov
Copy link
Contributor

echesakov commented Mar 13, 2020

I think we should re-consider API design for ExtractVector64 and ExtractVector128 - instead of

  1. forbidding floating point types and
  2. specifying a byteIndex

we should follow the approach that the C++ intrinsics take - specify elementIndex and have the JIT convert this element index to a byte index immediate - this way we will not get a denormalized floating-point value as a result, and we will be at parity with the C++ implementations.

For example, for ExtractVector64(upper, lower, 1) where upper and lower are Vector64<float> JIT will emit EXT Vd.8B, Vn.8B, Vm.8B, 4

@tannergooding
Copy link
Member

I don't like the approach of taking an elementIndex. That artificially limits the usage of the intrinsics and prevents you from extracting an arbitrary 64-bit sequence.

There are separate intrinsics for extracting individual elements from a vector: #24588 and users wanting to work with float can just use the zero cost reinterpret cast APIs (.AsInt32 and .AsSingle), which will force them to rationalize the denormal scenario and take it into consideration.

@echesakov
Copy link
Contributor

Another question concerning FusedMultiplyAddBySelectedScalar and FusedMultiplySubtractBySelectedScalar.

In C++ there exist
float32x2_t vfma_lane_f32 (float32x2_t a, float32x2_t b, float32x2_t v, const int lane)
and
float32x2_t vfma_laneq_f32 (float32x2_t a, float32x2_t b, float32x4_t v, const int lane).

Shouldn't we follow the same approach, i.e. have multiple overloads such as :

public static Vector64<float> FusedMultiplyAddBySelectedScalar(Vector64<float> addend, Vector64<float> left, Vector64<float> right, byte rightIndex);

public static Vector64<float> FusedMultiplyAddBySelectedScalar(Vector64<float> addend, Vector64<float> left, Vector128<float> right, byte rightIndex);

Alternatively, we can have right be Vector128<T> regardless of the size of addend and left, and upcast Vector64<T> to Vector128<T> if needed.

The current design, however, when we match the sizes of left,right and addend doesn't seem to be practical.

@echesakov
Copy link
Contributor

echesakov commented Mar 13, 2020

I don't like the approach of taking an elementIndex. That artificially limits the usage of the intrinsics and prevents you from extracting an arbitrary 64-bit sequence.

There are separate intrinsics for extracting individual elements from a vector: #24588 and users wanting to work with float can just use the zero cost reinterpret cast APIs (.AsInt32 and .AsSingle), which will force them to rationalize the denormal scenario and take it into consideration.

#24588 works only on one SIMD register not a pair of SIMD registers.

If a user wants to extract an arbitrary 8/16-bytes sequence why not convert both operands to Vector64/128 using As<byte>()?

@tannergooding
Copy link
Member

Shouldn't we follow the same approach, i.e. have multiple overloads such as
The current design, however, when we match the sizes of left,right and addend doesn't seem to be practical.

It doesn't look like the underlying instruction encoding requires they all be the same size and so I would guess this is done because C++ doesn't have an easy way to convert from V128<T> to V64<T>.
Maybe @TamarChristinaArm has a better idea of why the split exists?

If a user wants to extract an arbitrary 8/16-bytes sequence why not convert both operands to Vector64/128 using As()?

They could also do that, but the underlying instruction actually operates on byteIndex and we have tended away from adding abstractions of the instructions so far.

@echesakov
Copy link
Contributor

They could also do that, but the underlying instruction actually operates on byteIndex and we have tended away from adding abstractions of the instructions so far.

Well, this is true on Arm64 for EXT <Vd>.<T>, <Vn>.<T>, <Vm>.<T>, #<index> where
<index> is indeed a byte index. By the way, <T> can only be 8B or 16B that kind of suggests you are working on byte sequences.

However, on Arm32 VEXT (multibyte elements) VEXT.<size> {<Dd>,} <Dn>, <Dm>, #<imm> is a pseudo-instruction that translates by assembler to VEXT (byte elements) VEXT.8 {<Dd>,} <Dn>, <Dm>, #<imm*(size/8)>, i.e. #<imm> is an element index.

@tannergooding
Copy link
Member

Both instructions (VEXT on ARM32 and EXT on ARM64) operate identically. They are similar to orr or other logical operations. That is, the instruction encoding only takes 8B/16B but it isn't doing something that is logically byte only and will be frequently used for non byte operations.

We can always create an issue and re-discuss ExtractVector64/ExtractVector128 again on Tuesday, bringing up the C++ difference and whether having it operate on element by default is better (with requesting users to downcast if they want byte sequences instead).

@TamarChristinaArm
Copy link
Contributor

TamarChristinaArm commented Mar 16, 2020

Shouldn't we follow the same approach, i.e. have multiple overloads such as
The current design, however, when we match the sizes of left,right and addend doesn't seem to be practical.

It doesn't look like the underlying instruction encoding requires they all be the same size and so I would guess this is done because C++ doesn't have an easy way to convert from V128<T> to V64<T>.
Maybe @TamarChristinaArm has a better idea of why the split exists?

For such intrinsics the split is always 4 ways. The location of the q in the name denotes which components are 128 bits.

So for e.g. the float case we have

vfma_lane_f32
vfmaq_lane_f32
vfma_laneq_f32
vfmaq_laneq_f32

precisely because as you said the instruction doesn't require them to all be the same size.

This convention (partially) holds for newer ISAs such as MVE[1] and SVE[2] as well, though in those cases we also have completely overload driven instances as well. e.g. svmla_lane for SVE will do the normal overloading you would expect in C++ (and in C using C11's _Generic extension).

We do have a way to convert from V128<T> to V64<T> i.e. vget_low but they're not zero cost abstractions. So we prefer to provide the overloads.

[1] https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
[2] https://static.docs.arm.com/100987/0000/acle_sve_100987_0000_00_en.pdf

echesakov added a commit that referenced this issue Mar 24, 2020
)

- Implements the following intrinsics as proposed in #24794: 

* AbsoluteDifferenceAdd

* MultiplyExtended, MultiplyExtendedScalar

* PolynomialMultiply

* ReciprocalEstimate, ReciprocalEstimateScalar

* ReciprocalExponentScalar

* ReciprocalSquareRootEstimate, ReciprocalSquareRootEstimateScalar

* ReciprocalSquareRootStep, ReciprocalSquareRootStepScalar

* ReciprocalStep, ReciprocalStepScalar

- Implements frecpe, frecps, frecpx, frsqrte, frsqrts, urecpe, ursqrte instructions

- Fixes implementation for fcmeq, fcmge, fcmgt, fcmle, fcmlt *(zero immediate)* instructions

- Adds missing flag BaseTypeFromFirstArg for AbsoluteDifference intrinsics
@tannergooding
Copy link
Member

The following APIs are still to be implemented:

namespace System.Runtime.Intrinsics.Arm
{
    // API-proposal listing: the members below are signatures only (no bodies),
    // enumerating the intrinsics that were approved but not yet implemented.
    public static class AdvSimd
    {
        /// <summary>
        /// Vector extract from pair of vectors
        ///
        /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        // NOTE(review): the formula above is written in terms of left/right/index, but the
        // signatures below name their parameters upper/lower/byteIndex. Presumably
        // left = upper, right = lower and index = byteIndex -- confirm before implementing.
        public static Vector64<byte>   ExtractVector64(Vector64<byte>  upper, Vector64<byte>  lower, byte byteIndex);
        public static Vector64<sbyte>  ExtractVector64(Vector64<sbyte>  upper, Vector64<sbyte>  lower, byte byteIndex);
        public static Vector64<short>  ExtractVector64(Vector64<short>  upper, Vector64<short>  lower, byte byteIndex);
        public static Vector64<ushort> ExtractVector64(Vector64<ushort>  upper, Vector64<ushort>  lower, byte byteIndex);
        public static Vector64<int>    ExtractVector64(Vector64<int>  upper, Vector64<int>  lower, byte byteIndex);
        public static Vector64<uint>   ExtractVector64(Vector64<uint>  upper, Vector64<uint>  lower, byte byteIndex);

        // 128-bit counterparts of ExtractVector64; same parameter naming (upper/lower/byteIndex).
        public static Vector128<byte>   ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
        public static Vector128<sbyte>  ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
        public static Vector128<short>  ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
        public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
        public static Vector128<int>    ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
        public static Vector128<uint>   ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
        public static Vector128<long>   ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
        public static Vector128<ulong>  ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
        public static Vector128<float>  ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
        public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);

        /// <summary>
        /// Vector multiply add by element
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        // NOTE(review): "acc" in the formula appears to be the `addend` parameter, and the
        // scalar "right" appears to be the element of `right` selected by `rightIndex` -- confirm.
        // NOTE(review): the by-element form of ARM64 MLA is believed to accept only 16-bit and
        // 32-bit elements; verify that the byte/sbyte overloads below are encodable.
        public static Vector64<byte>    MultiplyAddBySelectedScalar(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplyAddBySelectedScalar(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplyAddBySelectedScalar(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplyAddBySelectedScalar(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplyAddBySelectedScalar(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplyAddBySelectedScalar(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplyAddBySelectedScalar(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplyAddBySelectedScalar(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplyAddBySelectedScalar(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplyAddBySelectedScalar(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplyAddBySelectedScalar(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        /// <summary>
        /// Vector multiply subtract by element
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        // NOTE(review): "acc" in the formula appears to be the `minuend` parameter, and the
        // scalar "right" appears to be the element of `right` selected by `rightIndex` -- confirm.
        // NOTE(review): the by-element form of ARM64 MLS is believed to accept only 16-bit and
        // 32-bit elements; verify that the byte/sbyte overloads below are encodable.
        public static Vector64<byte>    MultiplySubtractBySelectedScalar(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplySubtractBySelectedScalar(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplySubtractBySelectedScalar(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplySubtractBySelectedScalar(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplySubtractBySelectedScalar(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplySubtractBySelectedScalar(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplySubtractBySelectedScalar(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplySubtractBySelectedScalar(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplySubtractBySelectedScalar(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplySubtractBySelectedScalar(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplySubtractBySelectedScalar(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        // Nested class listing the members proposed under AdvSimd.Arm64.
        public static class Arm64
        {
            /// <summary>
            /// Vector extract from pair of vectors
            ///
            /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
            ///
            /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
            ///
            /// Corresponds to vector forms of ARM64 EXT
            /// </summary>
            // NOTE(review): this has the same name and parameter types as the
            // AdvSimd.ExtractVector128(Vector128<double>, Vector128<double>, byte) overload listed
            // above, differing only in parameter names (left/right/index vs upper/lower/byteIndex).
            // Presumably only one of the two is intended -- confirm which class should own it.
            public static Vector128<double> ExtractVector128(Vector128<double> left, Vector128<double> right, byte index);

            /// <summary>
            /// Vector multiply extended by element
            ///
            /// For each element result[elem] = left[elem] * right
            /// Handles the extended special cases involving zero and infinity.
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            // NOTE(review): the scalar "right" in the formula appears to be the element of
            // `right` selected by `rightIndex` -- confirm.
            public static Vector64<float>   MultiplyExtendedBySelectedScalar(Vector64<float>   left, Vector64<float>   right, byte rightIndex);
            public static Vector128<float>  MultiplyExtendedBySelectedScalar(Vector128<float>  left, Vector128<float>  right, byte rightIndex);
            public static Vector128<double> MultiplyExtendedBySelectedScalar(Vector128<double> left, Vector128<double> right, byte rightIndex);
        }
    }
}

The following APIs still need to be investigated and brought back for review (I will open a new issue for them):

namespace System.Runtime.Intrinsics.Arm
{
    // API-proposal listing: every body is a `throw null` placeholder, so only the
    // signatures are meaningful here.
    public static class AdvSimd
    {
        /// <summary>
        /// Vector reverse element bytes
        ///
        /// Reverses the order of the bytes within each element of <paramref name="value"/>.
        ///
        /// Corresponds to vector forms of ARM64 REV16, REV32, REV64
        /// </summary>
        // NOTE(review): presumably the instruction is chosen by element width (REV16 for
        // 16-bit, REV32 for 32-bit, REV64 for 64-bit elements) -- confirm during the follow-up
        // investigation. Overloads exist for float but not for double; verify that this is intended.
        public static Vector64<ushort>  ReverseElementBytes(Vector64<ushort>  value) { throw null; }
        public static Vector64<short>   ReverseElementBytes(Vector64<short>   value) { throw null; }
        public static Vector64<uint>    ReverseElementBytes(Vector64<uint>    value) { throw null; }
        public static Vector64<int>     ReverseElementBytes(Vector64<int>     value) { throw null; }
        public static Vector64<float>   ReverseElementBytes(Vector64<float>   value) { throw null; }
        public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) { throw null; }
        public static Vector128<short>  ReverseElementBytes(Vector128<short>  value) { throw null; }
        public static Vector128<uint>   ReverseElementBytes(Vector128<uint>   value) { throw null; }
        public static Vector128<int>    ReverseElementBytes(Vector128<int>    value) { throw null; }
        public static Vector128<ulong>  ReverseElementBytes(Vector128<ulong>  value) { throw null; }
        public static Vector128<long>   ReverseElementBytes(Vector128<long>   value) { throw null; }
        public static Vector128<float>  ReverseElementBytes(Vector128<float>  value) { throw null; }
    }
}

@echesakov
Copy link
Contributor

The following APIs are still to be implemented:

@tannergooding Also Fused_MLA/MLS_BySelectedScalar

And we need to add/propose MultiplyBySelectedScalar - mul also has by element form.

@tannergooding
Copy link
Member

Also Fused_MLA/MLS_BySelectedScalar
And we need to add/propose MultiplyBySelectedScalar - mul also has by element form.

The former haven't been proposed yet either (I don't see them listed anywhere above). I'm adding them to #33683

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
api-approved API was approved in API review, it can be implemented arch-arm64 area-System.Runtime.Intrinsics
Projects
None yet
Development

Successfully merging a pull request may close this issue.

10 participants