Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM additional arithmetic intrinsics #32512

Closed
tannergooding opened this issue Feb 18, 2020 · 16 comments · Fixed by #35612
Closed

ARM additional arithmetic intrinsics #32512

tannergooding opened this issue Feb 18, 2020 · 16 comments · Fixed by #35612
Assignees
Labels
api-approved API was approved in API review, it can be implemented arch-arm64 area-System.Runtime.Intrinsics

Comments

@tannergooding
Copy link
Member

tannergooding commented Feb 18, 2020

namespace System.Runtime.Intrinsics.Arm
{
    public abstract class AdvSimd
    {
        /// <summary>
        /// Add returning high narrow
        /// For each element result[elem] = UpperHalf(left[elem] + right[elem])
        /// Corresponds to vector forms of ADDHN and VADDHN
        /// </summary>
        Vector64<byte> AddHigh(Vector64<ushort> left, Vector64<ushort> right);
        Vector64<sbyte> AddHigh(Vector64<short> left, Vector64<short> right);
        Vector64<short> AddHigh(Vector64<int> left, Vector64<int> right);
        Vector64<ushort> AddHigh(Vector64<uint> left, Vector64<uint> right);
        Vector64<int> AddHigh(Vector64<long> left, Vector64<long> right);
        Vector64<uint> AddHigh(Vector64<ulong> left, Vector64<ulong> right);

        /// <summary>
        /// Rounding add returning high narrow
        /// For each element result[elem] = UpperHalf(left[elem] + right[elem] + (1 << (N - 1))), where N is the bit width of a result element
        /// Corresponds to vector forms of RADDHN and VRADDHN
        /// </summary>
        Vector64<byte> RoundedAddHigh(Vector64<ushort> left, Vector64<ushort> right);
        Vector64<sbyte> RoundedAddHigh(Vector64<short> left, Vector64<short> right);
        Vector64<short> RoundedAddHigh(Vector64<int> left, Vector64<int> right);
        Vector64<ushort> RoundedAddHigh(Vector64<uint> left, Vector64<uint> right);
        Vector64<int> RoundedAddHigh(Vector64<long> left, Vector64<long> right);
        Vector64<uint> RoundedAddHigh(Vector64<ulong> left, Vector64<ulong> right);

        /// <summary>
        /// Vector halving add
        /// For each element result[elem] = (left[elem] + right[elem]) / 2
        /// Corresponds to vector forms of SHADD/UHADD and VHADD
        /// Supports byte, sbyte, short, ushort, int, uint
        /// </summary>
        Vector64<T> HalvingAdd(Vector64<T> left, Vector64<T> right);
        Vector128<T> HalvingAdd(Vector128<T> left, Vector128<T> right);

        /// <summary>
        /// Vector rounding halving add
        /// For each element result[elem] = (left[elem] + right[elem] + 1) / 2
        /// Corresponds to vector forms of SRHADD/URHADD and VRHADD
        /// Supports byte, sbyte, short, ushort, int, uint
        /// </summary>
        Vector64<T> RoundedHalvingAdd(Vector64<T> left, Vector64<T> right);
        Vector128<T> RoundedHalvingAdd(Vector128<T> left, Vector128<T> right);

        /// <summary>
        /// Vector saturating add
        /// For each element result[elem] = Saturate(left[elem] + right[elem])
        /// Corresponds to vector forms of SQADD/UQADD and VQADD
        /// Supports byte, sbyte, short, ushort, int, uint
        /// Additionally supports long, ulong for V128
        /// </summary>
        Vector64<T> AddSaturate(Vector64<T> left, Vector64<T> right);
        Vector128<T> AddSaturate(Vector128<T> left, Vector128<T> right);

        /// <summary>
        /// Subtract returning high narrow
        /// For each element result[elem] = UpperHalf(left[elem] - right[elem])
        /// Corresponds to vector forms of SUBHN and VSUBHN
        /// </summary>
        Vector64<byte> SubtractHigh(Vector64<ushort> left, Vector64<ushort> right);
        Vector64<sbyte> SubtractHigh(Vector64<short> left, Vector64<short> right);
        Vector64<short> SubtractHigh(Vector64<int> left, Vector64<int> right);
        Vector64<ushort> SubtractHigh(Vector64<uint> left, Vector64<uint> right);
        Vector64<int> SubtractHigh(Vector64<long> left, Vector64<long> right);
        Vector64<uint> SubtractHigh(Vector64<ulong> left, Vector64<ulong> right);

        /// <summary>
        /// Rounding subtract returning high narrow
        /// For each element result[elem] = UpperHalf(left[elem] - right[elem] + (1 << (N - 1))), where N is the bit width of a result element
        /// Corresponds to vector forms of RSUBHN and VRSUBHN
        /// </summary>
        Vector64<byte> RoundedSubtractHigh(Vector64<ushort> left, Vector64<ushort> right);
        Vector64<sbyte> RoundedSubtractHigh(Vector64<short> left, Vector64<short> right);
        Vector64<short> RoundedSubtractHigh(Vector64<int> left, Vector64<int> right);
        Vector64<ushort> RoundedSubtractHigh(Vector64<uint> left, Vector64<uint> right);
        Vector64<int> RoundedSubtractHigh(Vector64<long> left, Vector64<long> right);
        Vector64<uint> RoundedSubtractHigh(Vector64<ulong> left, Vector64<ulong> right);

        /// <summary>
        /// Vector halving subtract
        /// For each element result[elem] = (left[elem] - right[elem]) / 2
        /// Corresponds to vector forms of SHSUB/UHSUB and VHSUB
        /// Supports byte, sbyte, short, ushort, int, uint
        /// </summary>
        Vector64<T> HalvingSubtract(Vector64<T> left, Vector64<T> right);
        Vector128<T> HalvingSubtract(Vector128<T> left, Vector128<T> right);

        /// <summary>
        /// Vector saturating subtract
        /// For each element result[elem] = Saturate(left[elem] - right[elem])
        /// Corresponds to vector forms of SQSUB/UQSUB and VQSUB
        /// Supports byte, sbyte, short, ushort, int, uint
        /// Additionally supports long, ulong for V128
        /// </summary>
        Vector64<T> SubtractSaturate(Vector64<T> left, Vector64<T> right);
        Vector128<T> SubtractSaturate(Vector128<T> left, Vector128<T> right);

        public abstract class Arm64
        {
            /// <summary>
            /// Add returning high narrow
            /// For each element result[elem] = UpperHalf(left[elem] + right[elem])
            /// Corresponds to vector forms of ADDHN2
            /// </summary>
            Vector128<byte> AddHighUpper(Vector64<byte> lower, Vector64<ushort> left, Vector64<ushort> right);
            Vector128<sbyte> AddHighUpper(Vector64<sbyte> lower, Vector64<short> left, Vector64<short> right);
            Vector128<short> AddHighUpper(Vector64<short> lower, Vector64<int> left, Vector64<int> right);
            Vector128<ushort> AddHighUpper(Vector64<ushort> lower, Vector64<uint> left, Vector64<uint> right);
            Vector128<int> AddHighUpper(Vector64<int> lower, Vector64<long> left, Vector64<long> right);
            Vector128<uint> AddHighUpper(Vector64<uint> lower, Vector64<ulong> left, Vector64<ulong> right);

            /// <summary>
            /// Rounding add returning high narrow
            /// For each element result[elem] = UpperHalf(left[elem] + right[elem] + (1 << (N - 1))), where N is the bit width of a result element
            /// Corresponds to vector forms of RADDHN2
            /// </summary>
            Vector128<byte> RoundedAddHighUpper(Vector64<byte> lower, Vector64<ushort> left, Vector64<ushort> right);
            Vector128<sbyte> RoundedAddHighUpper(Vector64<sbyte> lower, Vector64<short> left, Vector64<short> right);
            Vector128<short> RoundedAddHighUpper(Vector64<short> lower, Vector64<int> left, Vector64<int> right);
            Vector128<ushort> RoundedAddHighUpper(Vector64<ushort> lower, Vector64<uint> left, Vector64<uint> right);
            Vector128<int> RoundedAddHighUpper(Vector64<int> lower, Vector64<long> left, Vector64<long> right);
            Vector128<uint> RoundedAddHighUpper(Vector64<uint> lower, Vector64<ulong> left, Vector64<ulong> right);

            /// <summary>
            /// Scalar saturating add
            /// result[0] = Saturate(left[0] + right[0])
            /// Corresponds to scalar forms of SQADD/UQADD
            /// Supports byte, sbyte, short, ushort, int, uint, long, ulong
            /// </summary>
            Vector64<T> AddSaturateScalar(Vector64<T> left, Vector64<T> right);

            /// <summary>
            /// Subtract returning high narrow
            /// For each element result[elem] = UpperHalf(left[elem] - right[elem])
            /// Corresponds to vector forms of SUBHN2
            /// </summary>
            Vector128<byte> SubtractHighUpper(Vector64<byte> lower, Vector64<ushort> left, Vector64<ushort> right);
            Vector128<sbyte> SubtractHighUpper(Vector64<sbyte> lower, Vector64<short> left, Vector64<short> right);
            Vector128<short> SubtractHighUpper(Vector64<short> lower, Vector64<int> left, Vector64<int> right);
            Vector128<ushort> SubtractHighUpper(Vector64<ushort> lower, Vector64<uint> left, Vector64<uint> right);
            Vector128<int> SubtractHighUpper(Vector64<int> lower, Vector64<long> left, Vector64<long> right);
            Vector128<uint> SubtractHighUpper(Vector64<uint> lower, Vector64<ulong> left, Vector64<ulong> right);

            /// <summary>
            /// Rounding subtract returning high narrow
            /// For each element result[elem] = UpperHalf(left[elem] - right[elem] + (1 << (N - 1))), where N is the bit width of a result element
            /// Corresponds to vector forms of RSUBHN2
            /// </summary>
            Vector128<byte> RoundedSubtractHighUpper(Vector64<byte> lower, Vector64<ushort> left, Vector64<ushort> right);
            Vector128<sbyte> RoundedSubtractHighUpper(Vector64<sbyte> lower, Vector64<short> left, Vector64<short> right);
            Vector128<short> RoundedSubtractHighUpper(Vector64<short> lower, Vector64<int> left, Vector64<int> right);
            Vector128<ushort> RoundedSubtractHighUpper(Vector64<ushort> lower, Vector64<uint> left, Vector64<uint> right);
            Vector128<int> RoundedSubtractHighUpper(Vector64<int> lower, Vector64<long> left, Vector64<long> right);
            Vector128<uint> RoundedSubtractHighUpper(Vector64<uint> lower, Vector64<ulong> left, Vector64<ulong> right);

            /// <summary>
            /// Scalar saturating subtract
            /// result[0] = Saturate(left[0] - right[0])
            /// Corresponds to scalar forms of SQSUB/UQSUB
            /// Supports byte, sbyte, short, ushort, int, uint, long, ulong
            /// </summary>
            Vector64<T> SubtractSaturateScalar(Vector64<T> left, Vector64<T> right);

            /// <summary>
            /// Signed/Unsigned Add Long Pairwise
            /// For each element result[elem] = value[2 * elem] + value[2 * elem + 1]
            /// Corresponds to vector forms of SADDLP and UADDLP
            /// </summary>
            public static Vector128<short>  AddPairwiseAndWiden(Vector64<sbyte>  value) { throw null; }
            public static Vector128<ushort> AddPairwiseAndWiden(Vector64<byte>   value) { throw null; }
            public static Vector128<int>    AddPairwiseAndWiden(Vector64<short>  value) { throw null; }
            public static Vector128<uint>   AddPairwiseAndWiden(Vector64<ushort> value) { throw null; }
            public static Vector128<long>   AddPairwiseAndWiden(Vector64<int>    value) { throw null; }
            public static Vector128<ulong>  AddPairwiseAndWiden(Vector64<uint>   value) { throw null; }

            /// <summary>
            /// Signed/Unsigned Add and Accumulate Pairwise
            /// For each element result[elem] = addend[elem] + (value[2 * elem] + value[2 * elem + 1])
            /// Corresponds to vector forms of SADALP and UADALP
            /// </summary>
            public static Vector128<short>  AddPairwiseAndWiden(Vector128<short>  addend, Vector64<sbyte>  value) { throw null; }
            public static Vector128<ushort> AddPairwiseAndWiden(Vector128<ushort> addend, Vector64<byte>   value) { throw null; }
            public static Vector128<int>    AddPairwiseAndWiden(Vector128<int>    addend, Vector64<short>  value) { throw null; }
            public static Vector128<uint>   AddPairwiseAndWiden(Vector128<uint>   addend, Vector64<ushort> value) { throw null; }
            public static Vector128<long>   AddPairwiseAndWiden(Vector128<long>   addend, Vector64<int>    value) { throw null; }
            public static Vector128<ulong>  AddPairwiseAndWiden(Vector128<ulong>  addend, Vector64<uint>   value) { throw null; }

            /// <summary>
            /// Signed/Unsigned Absolute Difference Long
            /// For each element result[elem] = Abs(left[elem] - right[elem])
            /// Corresponds to vector forms of SABDL and UABDL
            /// </summary>
            public static Vector128<ushort> AbsoluteDifferenceAndWidenLow(Vector64<byte>   left, Vector64<byte>   right) { throw null; }
            public static Vector128<ushort> AbsoluteDifferenceAndWidenLow(Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAndWidenLow(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAndWidenLow(Vector64<short>  left, Vector64<short>  right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAndWidenLow(Vector64<uint>   left, Vector64<uint>   right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAndWidenLow(Vector64<int>    left, Vector64<int>    right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Absolute Difference Long
            /// For each element of the upper halves of left and right, result[elem] = Abs(left[elem] - right[elem])
            /// Corresponds to vector forms of SABDL2 and UABDL2
            /// </summary>
            public static Vector128<ushort> AbsoluteDifferenceAndWidenHigh(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<ushort> AbsoluteDifferenceAndWidenHigh(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAndWidenHigh(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAndWidenHigh(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAndWidenHigh(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAndWidenHigh(Vector128<int>    left, Vector128<int>    right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Absolute Difference and Accumulate Long
            /// For each element result[elem] = addend[elem] + Abs(left[elem] - right[elem])
            /// Corresponds to vector forms of SABAL and UABAL
            /// </summary>
            public static Vector128<ushort> AbsoluteDifferenceAddAndWidenLow(Vector128<ushort> addend, Vector64<byte>   left, Vector64<byte>   right) { throw null; }
            public static Vector128<ushort> AbsoluteDifferenceAddAndWidenLow(Vector128<ushort> addend, Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAddAndWidenLow(Vector128<uint>   addend, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAddAndWidenLow(Vector128<uint>   addend, Vector64<short>  left, Vector64<short>  right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAddAndWidenLow(Vector128<ulong>  addend, Vector64<uint>   left, Vector64<uint>   right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAddAndWidenLow(Vector128<ulong>  addend, Vector64<int>    left, Vector64<int>    right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Absolute Difference and Accumulate Long
            /// For each element of the upper halves of left and right, result[elem] = addend[elem] + Abs(left[elem] - right[elem])
            /// Corresponds to vector forms of SABAL2 and UABAL2
            /// </summary>
            public static Vector128<ushort> AbsoluteDifferenceAddAndWidenHigh(Vector128<ushort> addend, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<ushort> AbsoluteDifferenceAddAndWidenHigh(Vector128<ushort> addend, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAddAndWidenHigh(Vector128<uint>   addend, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<uint>   AbsoluteDifferenceAddAndWidenHigh(Vector128<uint>   addend, Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAddAndWidenHigh(Vector128<ulong>  addend, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<ulong>  AbsoluteDifferenceAddAndWidenHigh(Vector128<ulong>  addend, Vector128<int>    left, Vector128<int>    right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Add Long
            /// For each element result[elem] = left[elem] + right[elem]
            /// Corresponds to vector forms of SADDL and UADDL
            /// </summary>
            public static Vector128<short>  AddAndWidenLow(Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<ushort> AddAndWidenLow(Vector64<byte>   left, Vector64<byte>   right) { throw null; }
            public static Vector128<int>    AddAndWidenLow(Vector64<short>  left, Vector64<short>  right) { throw null; }
            public static Vector128<uint>   AddAndWidenLow(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
            public static Vector128<long>   AddAndWidenLow(Vector64<int>    left, Vector64<int>    right) { throw null; }
            public static Vector128<ulong>  AddAndWidenLow(Vector64<uint>   left, Vector64<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Add Long
            /// For each element result[elem] = left[elem] + right[elem]
            /// Corresponds to vector forms of SADDL2 and UADDL2
            /// </summary>
            public static Vector128<short>  AddAndWidenHigh(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> AddAndWidenHigh(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<int>    AddAndWidenHigh(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   AddAndWidenHigh(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<long>   AddAndWidenHigh(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<ulong>  AddAndWidenHigh(Vector128<uint>   left, Vector128<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Add Wide
            /// For each element result[elem] = left[elem] + right[elem]
            /// Corresponds to vector forms of SADDW and UADDW
            /// </summary>
            public static Vector128<short>  AddAndWidenLow(Vector128<short>   left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<ushort> AddAndWidenLow(Vector128<ushort>  left, Vector64<byte>   right) { throw null; }
            public static Vector128<int>    AddAndWidenLow(Vector128<int>     left, Vector64<short>  right) { throw null; }
            public static Vector128<uint>   AddAndWidenLow(Vector128<uint>    left, Vector64<ushort> right) { throw null; }
            public static Vector128<long>   AddAndWidenLow(Vector128<long>    left, Vector64<int>    right) { throw null; }
            public static Vector128<ulong>  AddAndWidenLow(Vector128<ulong>   left, Vector64<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Add Wide
            /// For each element result[elem] = left[elem] + right[elem]
            /// Corresponds to vector forms of SADDW2 and UADDW2
            /// </summary>
            public static Vector128<short>  AddAndWidenHigh(Vector128<short>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> AddAndWidenHigh(Vector128<ushort> left, Vector128<byte>   right) { throw null; }
            public static Vector128<int>    AddAndWidenHigh(Vector128<int>    left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   AddAndWidenHigh(Vector128<uint>   left, Vector128<ushort> right) { throw null; }
            public static Vector128<long>   AddAndWidenHigh(Vector128<long>   left, Vector128<int>    right) { throw null; }
            public static Vector128<ulong>  AddAndWidenHigh(Vector128<ulong>  left, Vector128<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Subtract Long
            /// For each element result[elem] = left[elem] - right[elem]
            /// Corresponds to vector forms of SSUBL and USUBL
            /// </summary>
            public static Vector128<short>  SubtractAndWidenLow(Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<ushort> SubtractAndWidenLow(Vector64<byte>   left, Vector64<byte>   right) { throw null; }
            public static Vector128<int>    SubtractAndWidenLow(Vector64<short>  left, Vector64<short>  right) { throw null; }
            public static Vector128<uint>   SubtractAndWidenLow(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
            public static Vector128<long>   SubtractAndWidenLow(Vector64<int>    left, Vector64<int>    right) { throw null; }
            public static Vector128<ulong>  SubtractAndWidenLow(Vector64<uint>   left, Vector64<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Subtract Long
            /// For each element result[elem] = left[elem] - right[elem]
            /// Corresponds to vector forms of SSUBL2 and USUBL2
            /// </summary>
            public static Vector128<short>  SubtractAndWidenHigh(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> SubtractAndWidenHigh(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<int>    SubtractAndWidenHigh(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   SubtractAndWidenHigh(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<long>   SubtractAndWidenHigh(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<ulong>  SubtractAndWidenHigh(Vector128<uint>   left, Vector128<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Subtract Wide
            /// For each element result[elem] = left[elem] - right[elem]
            /// Corresponds to vector forms of SSUBW and USUBW
            /// </summary>
            public static Vector128<short>  SubtractAndWidenLow(Vector128<short>   left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<ushort> SubtractAndWidenLow(Vector128<ushort>  left, Vector64<byte>   right) { throw null; }
            public static Vector128<int>    SubtractAndWidenLow(Vector128<int>     left, Vector64<short>  right) { throw null; }
            public static Vector128<uint>   SubtractAndWidenLow(Vector128<uint>    left, Vector64<ushort> right) { throw null; }
            public static Vector128<long>   SubtractAndWidenLow(Vector128<long>    left, Vector64<int>    right) { throw null; }
            public static Vector128<ulong>  SubtractAndWidenLow(Vector128<ulong>   left, Vector64<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Subtract Wide
            /// For each element result[elem] = left[elem] - right[elem]
            /// Corresponds to vector forms of SSUBW2 and USUBW2
            /// </summary>
            public static Vector128<short>  SubtractAndWidenHigh(Vector128<short>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> SubtractAndWidenHigh(Vector128<ushort> left, Vector128<byte>   right) { throw null; }
            public static Vector128<int>    SubtractAndWidenHigh(Vector128<int>    left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   SubtractAndWidenHigh(Vector128<uint>   left, Vector128<ushort> right) { throw null; }
            public static Vector128<long>   SubtractAndWidenHigh(Vector128<long>   left, Vector128<int>    right) { throw null; }
            public static Vector128<ulong>  SubtractAndWidenHigh(Vector128<ulong>  left, Vector128<uint>   right) { throw null; }


            /// <summary>
            /// Signed/Unsigned Multiply Long
            /// For each element result[elem] = left[elem] * right[elem]
            /// Corresponds to vector forms of SMULL and UMULL
            /// </summary>
            public static Vector128<short>  MultiplyAndWidenLow(Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<ushort> MultiplyAndWidenLow(Vector64<byte>   left, Vector64<byte>   right) { throw null; }
            public static Vector128<int>    MultiplyAndWidenLow(Vector64<short>  left, Vector64<short>  right) { throw null; }
            public static Vector128<uint>   MultiplyAndWidenLow(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
            public static Vector128<long>   MultiplyAndWidenLow(Vector64<int>    left, Vector64<int>    right) { throw null; }
            public static Vector128<ulong>  MultiplyAndWidenLow(Vector64<uint>   left, Vector64<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Multiply Long
            /// For each element result[elem] = left[elem] * right[elem]
            /// Corresponds to vector forms of SMULL2 and UMULL2
            /// </summary>
            public static Vector128<short>  MultiplyAndWidenHigh(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MultiplyAndWidenHigh(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<int>    MultiplyAndWidenHigh(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MultiplyAndWidenHigh(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<long>   MultiplyAndWidenHigh(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<ulong>  MultiplyAndWidenHigh(Vector128<uint>   left, Vector128<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Multiply and Add Long
            /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
            /// Corresponds to vector forms of SMLAL and UMLAL
            /// </summary>
            public static Vector128<short>  MultiplyAddAndWidenLow(Vector128<short>  addend, Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<ushort> MultiplyAddAndWidenLow(Vector128<ushort> addend, Vector64<byte>   left, Vector64<byte>   right) { throw null; }
            public static Vector128<int>    MultiplyAddAndWidenLow(Vector128<int>    addend, Vector64<short>  left, Vector64<short>  right) { throw null; }
            public static Vector128<uint>   MultiplyAddAndWidenLow(Vector128<uint>   addend, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
            public static Vector128<long>   MultiplyAddAndWidenLow(Vector128<long>   addend, Vector64<int>    left, Vector64<int>    right) { throw null; }
            public static Vector128<ulong>  MultiplyAddAndWidenLow(Vector128<ulong>  addend, Vector64<uint>   left, Vector64<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Multiply and Add Long
            /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
            /// Corresponds to vector forms of SMLAL2 and UMLAL2
            /// </summary>
            public static Vector128<short>  MultiplyAddAndWidenHigh(Vector128<short>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MultiplyAddAndWidenHigh(Vector128<ushort> addend, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<int>    MultiplyAddAndWidenHigh(Vector128<int>    addend, Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MultiplyAddAndWidenHigh(Vector128<uint>   addend, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<long>   MultiplyAddAndWidenHigh(Vector128<long>   addend, Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<ulong>  MultiplyAddAndWidenHigh(Vector128<ulong>  addend, Vector128<uint>   left, Vector128<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Multiply and Subtract Long
            /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
            /// Corresponds to vector forms of SMLSL and UMLSL
            /// </summary>
            public static Vector128<short>  MultiplySubtractAndWidenLow(Vector128<short>  minuend, Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
            public static Vector128<ushort> MultiplySubtractAndWidenLow(Vector128<ushort> minuend, Vector64<byte>   left, Vector64<byte>   right) { throw null; }
            public static Vector128<int>    MultiplySubtractAndWidenLow(Vector128<int>    minuend, Vector64<short>  left, Vector64<short>  right) { throw null; }
            public static Vector128<uint>   MultiplySubtractAndWidenLow(Vector128<uint>   minuend, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
            public static Vector128<long>   MultiplySubtractAndWidenLow(Vector128<long>   minuend, Vector64<int>    left, Vector64<int>    right) { throw null; }
            public static Vector128<ulong>  MultiplySubtractAndWidenLow(Vector128<ulong>  minuend, Vector64<uint>   left, Vector64<uint>   right) { throw null; }

            /// <summary>
            /// Signed/Unsigned Multiply and Subtract Long
            /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
            /// Corresponds to vector forms of SMLSL2 and UMLSL2
            /// </summary>
            public static Vector128<short>  MultiplySubtractAndWidenHigh(Vector128<short>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MultiplySubtractAndWidenHigh(Vector128<ushort> minuend, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<int>    MultiplySubtractAndWidenHigh(Vector128<int>    minuend, Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MultiplySubtractAndWidenHigh(Vector128<uint>   minuend, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<long>   MultiplySubtractAndWidenHigh(Vector128<long>   minuend, Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<ulong>  MultiplySubtractAndWidenHigh(Vector128<ulong>  minuend, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        }
    }
}
@Dotnet-GitSync-Bot Dotnet-GitSync-Bot added the untriaged New issue has not been triaged by the area owner label Feb 18, 2020
@tannergooding tannergooding added api-ready-for-review area-System.Runtime.Intrinsics and removed untriaged New issue has not been triaged by the area owner labels Feb 18, 2020
@tannergooding
Copy link
Member Author

CC. @CarolEidt, @echesakovMSFT, @TamarChristinaArm

This is a proposal for some of the remaining intrinsics that aren't yet in an existing proposal. I expect 2-3 more issues in the coming days to get put up before all have been covered.

@tannergooding tannergooding changed the title ARM additional addition and subtraction intrinsics ARM additional arithmetic intrinsics Mar 11, 2020
@tannergooding
Copy link
Member Author

Added in some more arithmetic intrinsics.

@terrajobst terrajobst added api-approved API was approved in API review, it can be implemented and removed api-ready-for-review labels Mar 17, 2020
@terrajobst
Copy link
Member

terrajobst commented Mar 17, 2020

Video

  • Looks good
  • We made some substantial changes to the order of words
  • AbsoluteDifferenceAdd in API Proposal: More SIMD HW Intrinsics #24794 should take Vector128<T> rather than Vector64<T>
    • Actually it's correct
  • The Halving methods are fused, so we should prefix them with Fused
// API-review listing: declaration-only signatures (no bodies), grouped by operation.
// Overloads written with `T` are shorthand; T is not declared anywhere in this listing.
// NOTE(review): presumably each `T` entry stands for one overload per supported element type - confirm.
namespace System.Runtime.Intrinsics.Arm
{
    public partial class AdvSimd
    {
        // NOTE(review): the members of this outer class omit `static`, unlike the widening members
        // listed further down under Arm64 - presumably an omission in the listing; confirm.

        // Narrowing add: each result element is half the width of the source elements.
        // NOTE(review): Vector64<ushort> holds 4 elements while Vector64<byte> holds 8, so the
        // source and result element counts do not match - presumably the sources should be
        // Vector128; confirm.
        public Vector64<byte>   AddLowerReturningUpper(Vector64<ushort> left, Vector64<ushort> right);
        public Vector64<sbyte>  AddLowerReturningUpper(Vector64<short>  left, Vector64<short>  right);
        public Vector64<short>  AddLowerReturningUpper(Vector64<int>    left, Vector64<int>    right);
        public Vector64<ushort> AddLowerReturningUpper(Vector64<uint>   left, Vector64<uint>   right);
        public Vector64<int>    AddLowerReturningUpper(Vector64<long>   left, Vector64<long>   right);
        public Vector64<uint>   AddLowerReturningUpper(Vector64<ulong>  left, Vector64<ulong>  right);

        // Rounded variant of the narrowing add above (same signature shapes, same element-count question).
        public Vector64<byte>   AddLowerRoundedReturningUpper(Vector64<ushort> left, Vector64<ushort> right);
        public Vector64<sbyte>  AddLowerRoundedReturningUpper(Vector64<short>  left, Vector64<short>  right);
        public Vector64<short>  AddLowerRoundedReturningUpper(Vector64<int>    left, Vector64<int>    right);
        public Vector64<ushort> AddLowerRoundedReturningUpper(Vector64<uint>   left, Vector64<uint>   right);
        public Vector64<int>    AddLowerRoundedReturningUpper(Vector64<long>   left, Vector64<long>   right);
        public Vector64<uint>   AddLowerRoundedReturningUpper(Vector64<ulong>  left, Vector64<ulong>  right);

        // Halving add, 64-bit and 128-bit vector forms.
        public Vector64<T>  FusedAddHalving(Vector64<T>  left, Vector64<T>  right);
        public Vector128<T> FusedAddHalving(Vector128<T> left, Vector128<T> right);

        // Rounded halving add.
        public Vector64<T>  FusedAddRoundedHalving(Vector64<T>  left, Vector64<T>  right);
        public Vector128<T> FusedAddRoundedHalving(Vector128<T> left, Vector128<T> right);

        // Saturating add.
        public Vector64<T>  AddSaturate(Vector64<T>  left, Vector64<T>  right);
        public Vector128<T> AddSaturate(Vector128<T> left, Vector128<T> right);

        // Narrowing subtract; mirrors AddLowerReturningUpper (same element-count question applies).
        public Vector64<byte>   SubtractLowerReturningUpper(Vector64<ushort> left, Vector64<ushort> right);
        public Vector64<sbyte>  SubtractLowerReturningUpper(Vector64<short>  left, Vector64<short>  right);
        public Vector64<short>  SubtractLowerReturningUpper(Vector64<int>    left, Vector64<int>    right);
        public Vector64<ushort> SubtractLowerReturningUpper(Vector64<uint>   left, Vector64<uint>   right);
        public Vector64<int>    SubtractLowerReturningUpper(Vector64<long>   left, Vector64<long>   right);
        public Vector64<uint>   SubtractLowerReturningUpper(Vector64<ulong>  left, Vector64<ulong>  right);

        // Rounded variant of the narrowing subtract above.
        public Vector64<byte>   SubtractLowerRoundedReturningUpper(Vector64<ushort> left, Vector64<ushort> right);
        public Vector64<sbyte>  SubtractLowerRoundedReturningUpper(Vector64<short>  left, Vector64<short>  right);
        public Vector64<short>  SubtractLowerRoundedReturningUpper(Vector64<int>    left, Vector64<int>    right);
        public Vector64<ushort> SubtractLowerRoundedReturningUpper(Vector64<uint>   left, Vector64<uint>   right);
        public Vector64<int>    SubtractLowerRoundedReturningUpper(Vector64<long>   left, Vector64<long>   right);
        public Vector64<uint>   SubtractLowerRoundedReturningUpper(Vector64<ulong>  left, Vector64<ulong>  right);

        // Halving subtract (no rounded counterpart is listed).
        public Vector64<T>  FusedSubtractHalving(Vector64<T>  left, Vector64<T>  right);
        public Vector128<T> FusedSubtractHalving(Vector128<T> left, Vector128<T> right);

        // Saturating subtract.
        public Vector64<T>  SubtractSaturate(Vector64<T>  left, Vector64<T>  right);
        public Vector128<T> SubtractSaturate(Vector128<T> left, Vector128<T> right);

        public partial class Arm64
        {
            // "Upper" narrowing add: takes an existing 64-bit `lower` vector plus two 128-bit
            // sources and returns a full 128-bit vector of the narrow element type.
            // NOTE(review): presumably `lower` becomes the lower half of the result and the
            // narrowed sums fill the upper half - confirm.
            public Vector128<byte>   AddUpperReturningUpper(Vector64<byte>   lower, Vector128<ushort> left, Vector128<ushort> right);
            public Vector128<sbyte>  AddUpperReturningUpper(Vector64<sbyte>  lower, Vector128<short>  left, Vector128<short>  right);
            public Vector128<short>  AddUpperReturningUpper(Vector64<short>  lower, Vector128<int>    left, Vector128<int>    right);
            public Vector128<ushort> AddUpperReturningUpper(Vector64<ushort> lower, Vector128<uint>   left, Vector128<uint>   right);
            public Vector128<int>    AddUpperReturningUpper(Vector64<int>    lower, Vector128<long>   left, Vector128<long>   right);
            public Vector128<uint>   AddUpperReturningUpper(Vector64<uint>   lower, Vector128<ulong>  left, Vector128<ulong>  right);

            // Rounded variant of the "upper" narrowing add.
            public Vector128<byte>   AddUpperRoundedReturningUpper(Vector64<byte>   lower, Vector128<ushort> left, Vector128<ushort> right);
            public Vector128<sbyte>  AddUpperRoundedReturningUpper(Vector64<sbyte>  lower, Vector128<short>  left, Vector128<short>  right);
            public Vector128<short>  AddUpperRoundedReturningUpper(Vector64<short>  lower, Vector128<int>    left, Vector128<int>    right);
            public Vector128<ushort> AddUpperRoundedReturningUpper(Vector64<ushort> lower, Vector128<uint>   left, Vector128<uint>   right);
            public Vector128<int>    AddUpperRoundedReturningUpper(Vector64<int>    lower, Vector128<long>   left, Vector128<long>   right);
            public Vector128<uint>   AddUpperRoundedReturningUpper(Vector64<uint>   lower, Vector128<ulong>  left, Vector128<ulong>  right);

            // Scalar saturating add.
            public Vector64<T> AddSaturateScalar(Vector64<T> left, Vector64<T> right);

            // "Upper" narrowing subtract; same shape as AddUpperReturningUpper.
            public Vector128<byte>   SubtractUpperReturningUpper(Vector64<byte>   lower, Vector128<ushort> left, Vector128<ushort> right);
            public Vector128<sbyte>  SubtractUpperReturningUpper(Vector64<sbyte>  lower, Vector128<short>  left, Vector128<short>  right);
            public Vector128<short>  SubtractUpperReturningUpper(Vector64<short>  lower, Vector128<int>    left, Vector128<int>    right);
            public Vector128<ushort> SubtractUpperReturningUpper(Vector64<ushort> lower, Vector128<uint>   left, Vector128<uint>   right);
            public Vector128<int>    SubtractUpperReturningUpper(Vector64<int>    lower, Vector128<long>   left, Vector128<long>   right);
            public Vector128<uint>   SubtractUpperReturningUpper(Vector64<uint>   lower, Vector128<ulong>  left, Vector128<ulong>  right);

            // Rounded variant of the "upper" narrowing subtract.
            public Vector128<byte>   SubtractUpperRoundedReturningUpper(Vector64<byte>   lower, Vector128<ushort> left, Vector128<ushort> right);
            public Vector128<sbyte>  SubtractUpperRoundedReturningUpper(Vector64<sbyte>  lower, Vector128<short>  left, Vector128<short>  right);
            public Vector128<short>  SubtractUpperRoundedReturningUpper(Vector64<short>  lower, Vector128<int>    left, Vector128<int>    right);
            public Vector128<ushort> SubtractUpperRoundedReturningUpper(Vector64<ushort> lower, Vector128<uint>   left, Vector128<uint>   right);
            public Vector128<int>    SubtractUpperRoundedReturningUpper(Vector64<int>    lower, Vector128<long>   left, Vector128<long>   right);
            public Vector128<uint>   SubtractUpperRoundedReturningUpper(Vector64<uint>   lower, Vector128<ulong>  left, Vector128<ulong>  right);

            // Scalar saturating subtract.
            public Vector64<T> SubtractSaturateScalar(Vector64<T> left, Vector64<T> right);

            // Pairwise widening add: the result element type is twice the width of the source's.
            // NOTE(review): a Vector64 source summed pairwise into double-width elements would
            // fill 64 bits, yet the return type is Vector128 - confirm the intended source/result widths.
            public static Vector128<short>  AddPairwiseWidening(Vector64<sbyte>  value);
            public static Vector128<ushort> AddPairwiseWidening(Vector64<byte>   value);
            public static Vector128<int>    AddPairwiseWidening(Vector64<short>  value);
            public static Vector128<uint>   AddPairwiseWidening(Vector64<ushort> value);
            public static Vector128<long>   AddPairwiseWidening(Vector64<int>    value);
            public static Vector128<ulong>  AddPairwiseWidening(Vector64<uint>   value);

            // Pairwise widening add accumulated into `addend` (same width question as above).
            public static Vector128<short>  AddPairwiseWideningAndAdd(Vector128<short>  addend, Vector64<sbyte>  value);
            public static Vector128<ushort> AddPairwiseWideningAndAdd(Vector128<ushort> addend, Vector64<byte>   value);
            public static Vector128<int>    AddPairwiseWideningAndAdd(Vector128<int>    addend, Vector64<short>  value);
            public static Vector128<uint>   AddPairwiseWideningAndAdd(Vector128<uint>   addend, Vector64<ushort> value);
            public static Vector128<long>   AddPairwiseWideningAndAdd(Vector128<long>   addend, Vector64<int>    value);
            public static Vector128<ulong>  AddPairwiseWideningAndAdd(Vector128<ulong>  addend, Vector64<uint>   value);

            // Widening absolute difference, lower form; every overload returns an unsigned vector.
            // NOTE(review): `left` is Vector128 while `right` is Vector64 here, unlike
            // AbsoluteDifferenceWideningLowerAndAdd below, which takes Vector64 for both -
            // presumably `left` should be Vector64 as well; confirm.
            public static Vector128<ushort> AbsoluteDifferenceWideningLower(Vector128<byte>   left, Vector64<byte>   right);
            public static Vector128<ushort> AbsoluteDifferenceWideningLower(Vector128<sbyte>  left, Vector64<sbyte>  right);
            public static Vector128<uint>   AbsoluteDifferenceWideningLower(Vector128<ushort> left, Vector64<ushort> right);
            public static Vector128<uint>   AbsoluteDifferenceWideningLower(Vector128<short>  left, Vector64<short>  right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningLower(Vector128<uint>   left, Vector64<uint>   right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningLower(Vector128<int>    left, Vector64<int>    right);

            // Widening absolute difference, upper form (both sources are 128-bit).
            public static Vector128<ushort> AbsoluteDifferenceWideningUpper(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<ushort> AbsoluteDifferenceWideningUpper(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<uint>   AbsoluteDifferenceWideningUpper(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<uint>   AbsoluteDifferenceWideningUpper(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningUpper(Vector128<uint>   left, Vector128<uint>   right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningUpper(Vector128<int>    left, Vector128<int>    right);

            // Widening absolute difference accumulated into an unsigned `addend`, lower form.
            public static Vector128<ushort> AbsoluteDifferenceWideningLowerAndAdd(Vector128<ushort> addend, Vector64<byte>   left, Vector64<byte>   right);
            public static Vector128<ushort> AbsoluteDifferenceWideningLowerAndAdd(Vector128<ushort> addend, Vector64<sbyte>  left, Vector64<sbyte>  right);
            public static Vector128<uint>   AbsoluteDifferenceWideningLowerAndAdd(Vector128<uint>   addend, Vector64<ushort> left, Vector64<ushort> right);
            public static Vector128<uint>   AbsoluteDifferenceWideningLowerAndAdd(Vector128<uint>   addend, Vector64<short>  left, Vector64<short>  right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningLowerAndAdd(Vector128<ulong>  addend, Vector64<uint>   left, Vector64<uint>   right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningLowerAndAdd(Vector128<ulong>  addend, Vector64<int>    left, Vector64<int>    right);

            // Widening absolute difference accumulated into an unsigned `addend`, upper form.
            public static Vector128<ushort> AbsoluteDifferenceWideningUpperAndAdd(Vector128<ushort> addend, Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<ushort> AbsoluteDifferenceWideningUpperAndAdd(Vector128<ushort> addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<uint>   AbsoluteDifferenceWideningUpperAndAdd(Vector128<uint>   addend, Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<uint>   AbsoluteDifferenceWideningUpperAndAdd(Vector128<uint>   addend, Vector128<short>  left, Vector128<short>  right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningUpperAndAdd(Vector128<ulong>  addend, Vector128<uint>   left, Vector128<uint>   right);
            public static Vector128<ulong>  AbsoluteDifferenceWideningUpperAndAdd(Vector128<ulong>  addend, Vector128<int>    left, Vector128<int>    right);

            // Widening add of two narrow 64-bit sources, lower form.
            public static Vector128<short>  AddWideningLower(Vector64<sbyte>  left, Vector64<sbyte>  right);
            public static Vector128<ushort> AddWideningLower(Vector64<byte>   left, Vector64<byte>   right);
            public static Vector128<int>    AddWideningLower(Vector64<short>  left, Vector64<short>  right);
            public static Vector128<uint>   AddWideningLower(Vector64<ushort> left, Vector64<ushort> right);
            public static Vector128<long>   AddWideningLower(Vector64<int>    left, Vector64<int>    right);
            public static Vector128<ulong>  AddWideningLower(Vector64<uint>   left, Vector64<uint>   right);

            // Widening add of two narrow 128-bit sources, upper form.
            public static Vector128<short>  AddWideningUpper(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> AddWideningUpper(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<int>    AddWideningUpper(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   AddWideningUpper(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<long>   AddWideningUpper(Vector128<int>    left, Vector128<int>    right);
            public static Vector128<ulong>  AddWideningUpper(Vector128<uint>   left, Vector128<uint>   right);

            // Mixed-width overloads of AddWideningLower: `left` already has the wide element type,
            // only `right` is narrow.
            public static Vector128<short>  AddWideningLower(Vector128<short>   left, Vector64<sbyte>  right);
            public static Vector128<ushort> AddWideningLower(Vector128<ushort>  left, Vector64<byte>   right);
            public static Vector128<int>    AddWideningLower(Vector128<int>     left, Vector64<short>  right);
            public static Vector128<uint>   AddWideningLower(Vector128<uint>    left, Vector64<ushort> right);
            public static Vector128<long>   AddWideningLower(Vector128<long>    left, Vector64<int>    right);
            public static Vector128<ulong>  AddWideningLower(Vector128<ulong>   left, Vector64<uint>   right);

            // Mixed-width overloads of AddWideningUpper: wide `left`, narrow 128-bit `right`.
            public static Vector128<short>  AddWideningUpper(Vector128<short>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> AddWideningUpper(Vector128<ushort> left, Vector128<byte>   right);
            public static Vector128<int>    AddWideningUpper(Vector128<int>    left, Vector128<short>  right);
            public static Vector128<uint>   AddWideningUpper(Vector128<uint>   left, Vector128<ushort> right);
            public static Vector128<long>   AddWideningUpper(Vector128<long>   left, Vector128<int>    right);
            public static Vector128<ulong>  AddWideningUpper(Vector128<ulong>  left, Vector128<uint>   right);

            // Widening subtract of two narrow 64-bit sources, lower form.
            public static Vector128<short>  SubtractWideningLower(Vector64<sbyte>  left, Vector64<sbyte>  right);
            public static Vector128<ushort> SubtractWideningLower(Vector64<byte>   left, Vector64<byte>   right);
            public static Vector128<int>    SubtractWideningLower(Vector64<short>  left, Vector64<short>  right);
            public static Vector128<uint>   SubtractWideningLower(Vector64<ushort> left, Vector64<ushort> right);
            public static Vector128<long>   SubtractWideningLower(Vector64<int>    left, Vector64<int>    right);
            public static Vector128<ulong>  SubtractWideningLower(Vector64<uint>   left, Vector64<uint>   right);

            // Widening subtract of two narrow 128-bit sources, upper form.
            public static Vector128<short>  SubtractWideningUpper(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> SubtractWideningUpper(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<int>    SubtractWideningUpper(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   SubtractWideningUpper(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<long>   SubtractWideningUpper(Vector128<int>    left, Vector128<int>    right);
            public static Vector128<ulong>  SubtractWideningUpper(Vector128<uint>   left, Vector128<uint>   right);

            // Mixed-width overloads of SubtractWideningLower: wide `left`, narrow 64-bit `right`.
            public static Vector128<short>  SubtractWideningLower(Vector128<short>   left, Vector64<sbyte>  right);
            public static Vector128<ushort> SubtractWideningLower(Vector128<ushort>  left, Vector64<byte>   right);
            public static Vector128<int>    SubtractWideningLower(Vector128<int>     left, Vector64<short>  right);
            public static Vector128<uint>   SubtractWideningLower(Vector128<uint>    left, Vector64<ushort> right);
            public static Vector128<long>   SubtractWideningLower(Vector128<long>    left, Vector64<int>    right);
            public static Vector128<ulong>  SubtractWideningLower(Vector128<ulong>   left, Vector64<uint>   right);

            // Mixed-width overloads of SubtractWideningUpper: wide `left`, narrow 128-bit `right`.
            public static Vector128<short>  SubtractWideningUpper(Vector128<short>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> SubtractWideningUpper(Vector128<ushort> left, Vector128<byte>   right);
            public static Vector128<int>    SubtractWideningUpper(Vector128<int>    left, Vector128<short>  right);
            public static Vector128<uint>   SubtractWideningUpper(Vector128<uint>   left, Vector128<ushort> right);
            public static Vector128<long>   SubtractWideningUpper(Vector128<long>   left, Vector128<int>    right);
            public static Vector128<ulong>  SubtractWideningUpper(Vector128<ulong>  left, Vector128<uint>   right);

            // Widening multiply, lower form.
            public static Vector128<short>  MultiplyWideningLower(Vector64<sbyte>  left, Vector64<sbyte>  right);
            public static Vector128<ushort> MultiplyWideningLower(Vector64<byte>   left, Vector64<byte>   right);
            public static Vector128<int>    MultiplyWideningLower(Vector64<short>  left, Vector64<short>  right);
            public static Vector128<uint>   MultiplyWideningLower(Vector64<ushort> left, Vector64<ushort> right);
            public static Vector128<long>   MultiplyWideningLower(Vector64<int>    left, Vector64<int>    right);
            public static Vector128<ulong>  MultiplyWideningLower(Vector64<uint>   left, Vector64<uint>   right);

            // Widening multiply, upper form.
            public static Vector128<short>  MultiplyWideningUpper(Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> MultiplyWideningUpper(Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<int>    MultiplyWideningUpper(Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   MultiplyWideningUpper(Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<long>   MultiplyWideningUpper(Vector128<int>    left, Vector128<int>    right);
            public static Vector128<ulong>  MultiplyWideningUpper(Vector128<uint>   left, Vector128<uint>   right);

            // Widening multiply accumulated into `addend`, lower form.
            public static Vector128<short>  MultiplyWideningLowerAndAdd(Vector128<short>  addend, Vector64<sbyte>  left, Vector64<sbyte>  right);
            public static Vector128<ushort> MultiplyWideningLowerAndAdd(Vector128<ushort> addend, Vector64<byte>   left, Vector64<byte>   right);
            public static Vector128<int>    MultiplyWideningLowerAndAdd(Vector128<int>    addend, Vector64<short>  left, Vector64<short>  right);
            public static Vector128<uint>   MultiplyWideningLowerAndAdd(Vector128<uint>   addend, Vector64<ushort> left, Vector64<ushort> right);
            public static Vector128<long>   MultiplyWideningLowerAndAdd(Vector128<long>   addend, Vector64<int>    left, Vector64<int>    right);
            public static Vector128<ulong>  MultiplyWideningLowerAndAdd(Vector128<ulong>  addend, Vector64<uint>   left, Vector64<uint>   right);

            // Widening multiply accumulated into `addend`, upper form.
            public static Vector128<short>  MultiplyWideningUpperAndAdd(Vector128<short>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> MultiplyWideningUpperAndAdd(Vector128<ushort> addend, Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<int>    MultiplyWideningUpperAndAdd(Vector128<int>    addend, Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   MultiplyWideningUpperAndAdd(Vector128<uint>   addend, Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<long>   MultiplyWideningUpperAndAdd(Vector128<long>   addend, Vector128<int>    left, Vector128<int>    right);
            public static Vector128<ulong>  MultiplyWideningUpperAndAdd(Vector128<ulong>  addend, Vector128<uint>   left, Vector128<uint>   right);

            // Widening multiply subtracted from `minuend`, lower form.
            public static Vector128<short>  MultiplyWideningLowerAndSubtract(Vector128<short>  minuend, Vector64<sbyte>  left, Vector64<sbyte>  right);
            public static Vector128<ushort> MultiplyWideningLowerAndSubtract(Vector128<ushort> minuend, Vector64<byte>   left, Vector64<byte>   right);
            public static Vector128<int>    MultiplyWideningLowerAndSubtract(Vector128<int>    minuend, Vector64<short>  left, Vector64<short>  right);
            public static Vector128<uint>   MultiplyWideningLowerAndSubtract(Vector128<uint>   minuend, Vector64<ushort> left, Vector64<ushort> right);
            public static Vector128<long>   MultiplyWideningLowerAndSubtract(Vector128<long>   minuend, Vector64<int>    left, Vector64<int>    right);
            public static Vector128<ulong>  MultiplyWideningLowerAndSubtract(Vector128<ulong>  minuend, Vector64<uint>   left, Vector64<uint>   right);

            // Widening multiply subtracted from `minuend`, upper form.
            public static Vector128<short>  MultiplyWideningUpperAndSubtract(Vector128<short>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right);
            public static Vector128<ushort> MultiplyWideningUpperAndSubtract(Vector128<ushort> minuend, Vector128<byte>   left, Vector128<byte>   right);
            public static Vector128<int>    MultiplyWideningUpperAndSubtract(Vector128<int>    minuend, Vector128<short>  left, Vector128<short>  right);
            public static Vector128<uint>   MultiplyWideningUpperAndSubtract(Vector128<uint>   minuend, Vector128<ushort> left, Vector128<ushort> right);
            public static Vector128<long>   MultiplyWideningUpperAndSubtract(Vector128<long>   minuend, Vector128<int>    left, Vector128<int>    right);
            public static Vector128<ulong>  MultiplyWideningUpperAndSubtract(Vector128<ulong>  minuend, Vector128<uint>   left, Vector128<uint>   right);
        }
    }
}

@tannergooding
Copy link
Member Author

The following intrinsics should be under AdvSimd not AdvSimd.Arm64 based on https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics

  • SABAL
  • SADDL
  • SADDW
  • SMLAL
  • SMLSL
  • SMULL
  • SSUBL
  • SSUBW
  • UABAL
  • UADDL
  • UADDW
  • UMLAL
  • UMLSL
  • UMULL
  • USUBL
  • USUBW
  • SADALP
  • SADDLP
  • UADALP
  • UADDLP

These are all "lower" variants. The "upper" variants are Arm64 only.

CC. @echesakovMSFT, @TamarChristinaArm

@TamarChristinaArm
Copy link
Contributor

@tannergooding same here. I believe some, if not all of these are implementable on AArch32 as well. I will go through them.

@echesakov
Copy link
Contributor

echesakov commented Apr 16, 2020

I believe
AbsoluteDifferenceWideningUpperAndAdd (SABAL2, UABAL2) can also be implemented on Arm32 as

VABAL Qd, Dn, Dm

where Dn and Dm correspond to the upper parts of the Vector128<T> left and right arguments.

I believe the reason why Arm64 introduces SABAL2 and UABAL2 is that you can't access the upper 64-bit parts of 128-bit vector registers using 64-bit register names, while on Arm32 D_{2n} and D_{2n+1} correspond to the lower and upper parts of Q_{n}.

@echesakov
Copy link
Contributor

@TamarChristinaArm Are you working on these, or have you started implementing them? If not, I can take these as my next work item, since I am a little bit reluctant to start working on the remaining intrinsics in #24794 (*BySelectedScalar) before we have agreed on their names.

@echesakov
Copy link
Contributor

It looks like AddLowerReturningUpper should operate on Vector128<T>?

        public Vector64<byte>   AddLowerReturningUpper(Vector128<ushort> left, Vector128<ushort> right);
        public Vector64<sbyte>  AddLowerReturningUpper(Vector128<short>  left, Vector128<short>  right);
        public Vector64<short>  AddLowerReturningUpper(Vector128<int>    left, Vector128<int>    right);
        public Vector64<ushort> AddLowerReturningUpper(Vector128<uint>   left, Vector128<uint>   right);
        public Vector64<int>    AddLowerReturningUpper(Vector128<long>   left, Vector128<long>   right);
        public Vector64<uint>   AddLowerReturningUpper(Vector128<ulong>  left, Vector128<ulong>  right);

and AddLowerRoundedReturningUpper as well

        public Vector64<byte>   AddLowerRoundedReturningUpper(Vector128<ushort> left, Vector128<ushort> right);
        public Vector64<sbyte>  AddLowerRoundedReturningUpper(Vector128<short>  left, Vector128<short>  right);
        public Vector64<short>  AddLowerRoundedReturningUpper(Vector128<int>    left, Vector128<int>    right);
        public Vector64<ushort> AddLowerRoundedReturningUpper(Vector128<uint>   left, Vector128<uint>   right);
        public Vector64<int>    AddLowerRoundedReturningUpper(Vector128<long>   left, Vector128<long>   right);
        public Vector64<uint>   AddLowerRoundedReturningUpper(Vector128<ulong>  left, Vector128<ulong>  right);

@echesakov
Copy link
Contributor

Based on the description of ADDHN{2} and RADDHN{2}

places the most significant
half of the result into a vector, and writes the vector to the lower or upper half of the destination SIMD&FP register.

Shouldn't these be AddUpperReturningLower and AddUpperRoundedReturningUpper? Even these names are confusing, since they imply that the addition happens only on the upper part of the source registers, which is not true: we add the whole registers together and extract the upper parts only after that.

@tannergooding
Copy link
Member Author

Yeah, I think those names were likely messed up during the API review as the general terms are confusing.

They perform AddReturningUpper and write the result to the Lower/Upper half of the result register, and I'm not sure of a good name for them that isn't equally confusing.

@echesakov
Copy link
Contributor

AddHighNarrowingToUpper AddHighNarrowingToLower?

@tannergooding
Copy link
Member Author

That doesn't fit with the general naming convention we've used elsewhere

@TamarChristinaArm
Copy link
Contributor

@TamarChristinaArm Are you working on these, or have you started implementing them? If not, I can take these as my next work item, since I am a little bit reluctant to start working on the remaining intrinsics in #24794 (*BySelectedScalar) before we have agreed on their names.

No I'm not, I'm currently doing the single register TBL/TBX and the SLI/SRI ones.

@echesakov echesakov self-assigned this Apr 17, 2020
@echesakov
Copy link
Contributor

That doesn't fit with the general naming convention we've used elsewhere

@tannergooding Can you please add these to the next API meeting agenda? There should be clarification on what names to use here.

I don't want to block on this, so I am going to start implementing the intrinsics and use AddReturningHighNarrowUpper and AddReturningHighNarrowLower as temporary names. When the names are finalized, these can be updated with a simple search and replace.

@echesakov
Copy link
Contributor

I went through all of the intrinsics - it looks like only AddSaturateScalar and SubtractSaturateScalar (for 8-, 16- and 32-bit arguments) are A64-only. Everything else can be implemented on both platforms.

Below I attached the corrected API and added the corresponding C++ intrinsic names and the instructions with operands on A32 and A64.

// Corrected listing of the A64-only scalar saturating add/subtract overloads
// (byte, sbyte, short, ushort, int and uint element types).
// Each entry is preceded by the C intrinsic it maps to and the A64 instruction with its operands.
class AdvSimd.Arm64
{
  // ---- Saturating add (UQADD for unsigned, SQADD for signed element types) ----

  // uint8_t vqaddb_u8 (uint8_t a, uint8_t b)
  //   A64: UQADD Bd, Bn, Bm
  public static Vector64<byte> AddSaturateScalar(Vector64<byte> left, Vector64<byte> right);

  // int16_t vqaddh_s16 (int16_t a, int16_t b)
  //   A64: SQADD Hd, Hn, Hm
  public static Vector64<short> AddSaturateScalar(Vector64<short> left, Vector64<short> right);

  // int32_t vqadds_s32 (int32_t a, int32_t b)
  //   A64: SQADD Sd, Sn, Sm
  public static Vector64<int> AddSaturateScalar(Vector64<int> left, Vector64<int> right);

  // int8_t vqaddb_s8 (int8_t a, int8_t b)
  //   A64: SQADD Bd, Bn, Bm
  public static Vector64<sbyte> AddSaturateScalar(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint16_t vqaddh_u16 (uint16_t a, uint16_t b)
  //   A64: UQADD Hd, Hn, Hm
  public static Vector64<ushort> AddSaturateScalar(Vector64<ushort> left, Vector64<ushort> right);

  // uint32_t vqadds_u32 (uint32_t a, uint32_t b)
  //   A64: UQADD Sd, Sn, Sm
  public static Vector64<uint> AddSaturateScalar(Vector64<uint> left, Vector64<uint> right);

  // ---- Saturating subtract (UQSUB for unsigned, SQSUB for signed element types) ----

  // uint8_t vqsubb_u8 (uint8_t a, uint8_t b)
  //   A64: UQSUB Bd, Bn, Bm
  public static Vector64<byte> SubtractSaturateScalar(Vector64<byte> left, Vector64<byte> right);

  // int16_t vqsubh_s16 (int16_t a, int16_t b)
  //   A64: SQSUB Hd, Hn, Hm
  public static Vector64<short> SubtractSaturateScalar(Vector64<short> left, Vector64<short> right);

  // int32_t vqsubs_s32 (int32_t a, int32_t b)
  //   A64: SQSUB Sd, Sn, Sm
  public static Vector64<int> SubtractSaturateScalar(Vector64<int> left, Vector64<int> right);

  // int8_t vqsubb_s8 (int8_t a, int8_t b)
  //   A64: SQSUB Bd, Bn, Bm
  public static Vector64<sbyte> SubtractSaturateScalar(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint16_t vqsubh_u16 (uint16_t a, uint16_t b)
  //   A64: UQSUB Hd, Hn, Hm
  public static Vector64<ushort> SubtractSaturateScalar(Vector64<ushort> left, Vector64<ushort> right);

  // uint32_t vqsubs_u32 (uint32_t a, uint32_t b)
  //   A64: UQSUB Sd, Sn, Sm
  public static Vector64<uint> SubtractSaturateScalar(Vector64<uint> left, Vector64<uint> right);
}
class AdvSimd
{
  // uint16x8_t vabdl_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VABDL.U8 Qd, Dn, Dm
  //   A64: UABDL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<ushort> AbsoluteDifferenceWideningLower(Vector64<byte> left, Vector64<byte> right);

  // int32x4_t vabdl_s16 (int16x4_t a, int16x4_t b)
  //   A32: VABDL.S16 Qd, Dn, Dm
  //   A64: SABDL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<int> AbsoluteDifferenceWideningLower(Vector64<short> left, Vector64<short> right);

  // int64x2_t vabdl_s32 (int32x2_t a, int32x2_t b)
  //   A32: VABDL.S32 Qd, Dn, Dm
  //   A64: SABDL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<long> AbsoluteDifferenceWideningLower(Vector64<int> left, Vector64<int> right);

  // int16x8_t vabdl_s8 (int8x8_t a, int8x8_t b)
  //   A32: VABDL.S8 Qd, Dn, Dm
  //   A64: SABDL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<short> AbsoluteDifferenceWideningLower(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint32x4_t vabdl_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VABDL.U16 Qd, Dn, Dm
  //   A64: UABDL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<uint> AbsoluteDifferenceWideningLower(Vector64<ushort> left, Vector64<ushort> right);

  // uint64x2_t vabdl_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VABDL.U32 Qd, Dn, Dm
  //   A64: UABDL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<ulong> AbsoluteDifferenceWideningLower(Vector64<uint> left, Vector64<uint> right);

  // int16x8_t vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
  //   A32: VABAL.S8 Qd, Dn, Dm
  //   A64: SABAL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<short> AbsoluteDifferenceWideningLowerAndAdd(Vector128<short> addend, Vector64<sbyte> left, Vector64<sbyte> right);

  // int32x4_t vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
  //   A32: VABAL.S16 Qd, Dn, Dm
  //   A64: SABAL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<int> AbsoluteDifferenceWideningLowerAndAdd(Vector128<int> addend, Vector64<short> left, Vector64<short> right);

  // int64x2_t vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
  //   A32: VABAL.S32 Qd, Dn, Dm
  //   A64: SABAL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<long> AbsoluteDifferenceWideningLowerAndAdd(Vector128<long> addend, Vector64<int> left, Vector64<int> right);

  // uint16x8_t vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
  //   A32: VABAL.U8 Qd, Dn, Dm
  //   A64: UABAL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<ushort> AbsoluteDifferenceWideningLowerAndAdd(Vector128<ushort> addend, Vector64<byte> left, Vector64<byte> right);

  // uint32x4_t vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
  //   A32: VABAL.U16 Qd, Dn, Dm
  //   A64: UABAL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<uint> AbsoluteDifferenceWideningLowerAndAdd(Vector128<uint> addend, Vector64<ushort> left, Vector64<ushort> right);

  // uint64x2_t vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
  //   A32: VABAL.U32 Qd, Dn, Dm
  //   A64: UABAL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<ulong> AbsoluteDifferenceWideningLowerAndAdd(Vector128<ulong> addend, Vector64<uint> left, Vector64<uint> right);

  // uint16x8_t vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VABDL.U8 Qd, Dn+1, Dm+1
  //   A64: UABDL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<ushort> AbsoluteDifferenceWideningUpper(Vector128<byte> left, Vector128<byte> right);

  // int32x4_t vabdl_high_s16 (int16x8_t a, int16x8_t b)
  //   A32: VABDL.S16 Qd, Dn+1, Dm+1
  //   A64: SABDL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<int> AbsoluteDifferenceWideningUpper(Vector128<short> left, Vector128<short> right);

  // int64x2_t vabdl_high_s32 (int32x4_t a, int32x4_t b)
  //   A32: VABDL.S32 Qd, Dn+1, Dm+1
  //   A64: SABDL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<long> AbsoluteDifferenceWideningUpper(Vector128<int> left, Vector128<int> right);

  // int16x8_t vabdl_high_s8 (int8x16_t a, int8x16_t b)
  //   A32: VABDL.S8 Qd, Dn+1, Dm+1
  //   A64: SABDL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<short> AbsoluteDifferenceWideningUpper(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint32x4_t vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VABDL.U16 Qd, Dn+1, Dm+1
  //   A64: UABDL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<uint> AbsoluteDifferenceWideningUpper(Vector128<ushort> left, Vector128<ushort> right);

  // uint64x2_t vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VABDL.U32 Qd, Dn+1, Dm+1
  //   A64: UABDL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<ulong> AbsoluteDifferenceWideningUpper(Vector128<uint> left, Vector128<uint> right);

  // int16x8_t vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
  //   A32: VABAL.S8 Qd, Dn+1, Dm+1
  //   A64: SABAL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<short> AbsoluteDifferenceWideningUpperAndAdd(Vector128<short> addend, Vector128<sbyte> left, Vector128<sbyte> right);

  // int32x4_t vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
  //   A32: VABAL.S16 Qd, Dn+1, Dm+1
  //   A64: SABAL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<int> AbsoluteDifferenceWideningUpperAndAdd(Vector128<int> addend, Vector128<short> left, Vector128<short> right);

  // int64x2_t vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
  //   A32: VABAL.S32 Qd, Dn+1, Dm+1
  //   A64: SABAL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<long> AbsoluteDifferenceWideningUpperAndAdd(Vector128<long> addend, Vector128<int> left, Vector128<int> right);

  // uint16x8_t vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
  //   A32: VABAL.U8 Qd, Dn+1, Dm+1
  //   A64: UABAL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<ushort> AbsoluteDifferenceWideningUpperAndAdd(Vector128<ushort> addend, Vector128<byte> left, Vector128<byte> right);

  // uint32x4_t vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
  //   A32: VABAL.U16 Qd, Dn+1, Dm+1
  //   A64: UABAL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<uint> AbsoluteDifferenceWideningUpperAndAdd(Vector128<uint> addend, Vector128<ushort> left, Vector128<ushort> right);

  // uint64x2_t vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
  //   A32: VABAL.U32 Qd, Dn+1, Dm+1
  //   A64: UABAL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<ulong> AbsoluteDifferenceWideningUpperAndAdd(Vector128<ulong> addend, Vector128<uint> left, Vector128<uint> right);

  // int16x4_t vpaddl_s8 (int8x8_t a)
  //   A32: VPADDL.S8 Dd, Dm
  //   A64: SADDLP Vd.4H, Vn.8B
  public static Vector64<short> AddPairwiseWidening(Vector64<sbyte> value);

  // int32x2_t vpaddl_s16 (int16x4_t a)
  //   A32: VPADDL.S16 Dd, Dm
  //   A64: SADDLP Vd.2S, Vn.4H
  public static Vector64<int> AddPairwiseWidening(Vector64<short> value);

  // uint16x4_t vpaddl_u8 (uint8x8_t a)
  //   A32: VPADDL.U8 Dd, Dm
  //   A64: UADDLP Vd.4H, Vn.8B
  public static Vector64<ushort> AddPairwiseWidening(Vector64<byte> value);

  // uint32x2_t vpaddl_u16 (uint16x4_t a)
  //   A32: VPADDL.U16 Dd, Dm
  //   A64: UADDLP Vd.2S, Vn.4H
  public static Vector64<uint> AddPairwiseWidening(Vector64<ushort> value);

  // int16x8_t vpaddlq_s8 (int8x16_t a)
  //   A32: VPADDL.S8 Qd, Qm
  //   A64: SADDLP Vd.8H, Vn.16B
  public static Vector128<short> AddPairwiseWidening(Vector128<sbyte> value);

  // int32x4_t vpaddlq_s16 (int16x8_t a)
  //   A32: VPADDL.S16 Qd, Qm
  //   A64: SADDLP Vd.4S, Vn.8H
  public static Vector128<int> AddPairwiseWidening(Vector128<short> value);

  // int64x2_t vpaddlq_s32 (int32x4_t a)
  //   A32: VPADDL.S32 Qd, Qm
  //   A64: SADDLP Vd.2D, Vn.4S
  public static Vector128<long> AddPairwiseWidening(Vector128<int> value);

  // uint16x8_t vpaddlq_u8 (uint8x16_t a)
  //   A32: VPADDL.U8 Qd, Qm
  //   A64: UADDLP Vd.8H, Vn.16B
  public static Vector128<ushort> AddPairwiseWidening(Vector128<byte> value);

  // uint32x4_t vpaddlq_u16 (uint16x8_t a)
  //   A32: VPADDL.U16 Qd, Qm
  //   A64: UADDLP Vd.4S, Vn.8H
  public static Vector128<uint> AddPairwiseWidening(Vector128<ushort> value);

  // uint64x2_t vpaddlq_u32 (uint32x4_t a)
  //   A32: VPADDL.U32 Qd, Qm
  //   A64: UADDLP Vd.2D, Vn.4S
  public static Vector128<ulong> AddPairwiseWidening(Vector128<uint> value);

  // int16x4_t vpadal_s8 (int16x4_t a, int8x8_t b)
  //   A32: VPADAL.S8 Dd, Dm
  //   A64: SADALP Vd.4H, Vn.8B
  public static Vector64<short> AddPairwiseWideningAndAdd(Vector64<short> addend, Vector64<sbyte> value);

  // int32x2_t vpadal_s16 (int32x2_t a, int16x4_t b)
  //   A32: VPADAL.S16 Dd, Dm
  //   A64: SADALP Vd.2S, Vn.4H
  public static Vector64<int> AddPairwiseWideningAndAdd(Vector64<int> addend, Vector64<short> value);

  // uint16x4_t vpadal_u8 (uint16x4_t a, uint8x8_t b)
  //   A32: VPADAL.U8 Dd, Dm
  //   A64: UADALP Vd.4H, Vn.8B
  public static Vector64<ushort> AddPairwiseWideningAndAdd(Vector64<ushort> addend, Vector64<byte> value);

  // uint32x2_t vpadal_u16 (uint32x2_t a, uint16x4_t b)
  //   A32: VPADAL.U16 Dd, Dm
  //   A64: UADALP Vd.2S, Vn.4H
  public static Vector64<uint> AddPairwiseWideningAndAdd(Vector64<uint> addend, Vector64<ushort> value);

  // int16x8_t vpadalq_s8 (int16x8_t a, int8x16_t b)
  //   A32: VPADAL.S8 Qd, Qm
  //   A64: SADALP Vd.8H, Vn.16B
  public static Vector128<short> AddPairwiseWideningAndAdd(Vector128<short> addend, Vector128<sbyte> value);

  // int32x4_t vpadalq_s16 (int32x4_t a, int16x8_t b)
  //   A32: VPADAL.S16 Qd, Qm
  //   A64: SADALP Vd.4S, Vn.8H
  public static Vector128<int> AddPairwiseWideningAndAdd(Vector128<int> addend, Vector128<short> value);

  // int64x2_t vpadalq_s32 (int64x2_t a, int32x4_t b)
  //   A32: VPADAL.S32 Qd, Qm
  //   A64: SADALP Vd.2D, Vn.4S
  public static Vector128<long> AddPairwiseWideningAndAdd(Vector128<long> addend, Vector128<int> value);

  // uint16x8_t vpadalq_u8 (uint16x8_t a, uint8x16_t b)
  //   A32: VPADAL.U8 Qd, Qm
  //   A64: UADALP Vd.8H, Vn.16B
  public static Vector128<ushort> AddPairwiseWideningAndAdd(Vector128<ushort> addend, Vector128<byte> value);

  // uint32x4_t vpadalq_u16 (uint32x4_t a, uint16x8_t b)
  //   A32: VPADAL.U16 Qd, Qm
  //   A64: UADALP Vd.4S, Vn.8H
  public static Vector128<uint> AddPairwiseWideningAndAdd(Vector128<uint> addend, Vector128<ushort> value);

  // uint64x2_t vpadalq_u32 (uint64x2_t a, uint32x4_t b)
  //   A32: VPADAL.U32 Qd, Qm
  //   A64: UADALP Vd.2D, Vn.4S
  public static Vector128<ulong> AddPairwiseWideningAndAdd(Vector128<ulong> addend, Vector128<uint> value);

  // int64x1_t vpadal_s32 (int64x1_t a, int32x2_t b)
  //   A32: VPADAL.S32 Dd, Dm
  //   A64: SADALP Vd.1D, Vn.2S
  public static Vector64<long> AddPairwiseWideningAndAddScalar(Vector64<long> addend, Vector64<int> value);

  // uint64x1_t vpadal_u32 (uint64x1_t a, uint32x2_t b)
  //   A32: VPADAL.U32 Dd, Dm
  //   A64: UADALP Vd.1D, Vn.2S
  public static Vector64<ulong> AddPairwiseWideningAndAddScalar(Vector64<ulong> addend, Vector64<uint> value);

  // int64x1_t vpaddl_s32 (int32x2_t a)
  //   A32: VPADDL.S32 Dd, Dm
  //   A64: SADDLP Vd.1D, Vn.2S
  public static Vector64<long> AddPairwiseWideningScalar(Vector64<int> value);

  // uint64x1_t vpaddl_u32 (uint32x2_t a)
  //   A32: VPADDL.U32 Dd, Dm
  //   A64: UADDLP Vd.1D, Vn.2S
  public static Vector64<ulong> AddPairwiseWideningScalar(Vector64<uint> value);

  // int8x8_t vaddhn_s16 (int16x8_t a, int16x8_t b)
  //   A32: VADDHN.I16 Dd, Qn, Qm
  //   A64: ADDHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<sbyte> AddReturningHighNarrowLower(Vector128<short> left, Vector128<short> right);

  // int16x4_t vaddhn_s32 (int32x4_t a, int32x4_t b)
  //   A32: VADDHN.I32 Dd, Qn, Qm
  //   A64: ADDHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<short> AddReturningHighNarrowLower(Vector128<int> left, Vector128<int> right);

  // int32x2_t vaddhn_s64 (int64x2_t a, int64x2_t b)
  //   A32: VADDHN.I64 Dd, Qn, Qm
  //   A64: ADDHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<int> AddReturningHighNarrowLower(Vector128<long> left, Vector128<long> right);

  // uint8x8_t vaddhn_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VADDHN.I16 Dd, Qn, Qm
  //   A64: ADDHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<byte> AddReturningHighNarrowLower(Vector128<ushort> left, Vector128<ushort> right);

  // uint16x4_t vaddhn_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VADDHN.I32 Dd, Qn, Qm
  //   A64: ADDHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<ushort> AddReturningHighNarrowLower(Vector128<uint> left, Vector128<uint> right);

  // uint32x2_t vaddhn_u64 (uint64x2_t a, uint64x2_t b)
  //   A32: VADDHN.I64 Dd, Qn, Qm
  //   A64: ADDHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<uint> AddReturningHighNarrowLower(Vector128<ulong> left, Vector128<ulong> right);

  // int8x16_t vaddhn_high_s16 (int8x8_t r, int16x8_t a, int16x8_t b)
  //   A32: VADDHN.I16 Dd+1, Qn, Qm
  //   A64: ADDHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<sbyte> AddReturningHighNarrowUpper(Vector64<sbyte> lower, Vector128<short> left, Vector128<short> right);

  // int16x8_t vaddhn_high_s32 (int16x4_t r, int32x4_t a, int32x4_t b)
  //   A32: VADDHN.I32 Dd+1, Qn, Qm
  //   A64: ADDHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<short> AddReturningHighNarrowUpper(Vector64<short> lower, Vector128<int> left, Vector128<int> right);

  // int32x4_t vaddhn_high_s64 (int32x2_t r, int64x2_t a, int64x2_t b)
  //   A32: VADDHN.I64 Dd+1, Qn, Qm
  //   A64: ADDHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<int> AddReturningHighNarrowUpper(Vector64<int> lower, Vector128<long> left, Vector128<long> right);

  // uint8x16_t vaddhn_high_u16 (uint8x8_t r, uint16x8_t a, uint16x8_t b)
  //   A32: VADDHN.I16 Dd+1, Qn, Qm
  //   A64: ADDHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<byte> AddReturningHighNarrowUpper(Vector64<byte> lower, Vector128<ushort> left, Vector128<ushort> right);

  // uint16x8_t vaddhn_high_u32 (uint16x4_t r, uint32x4_t a, uint32x4_t b)
  //   A32: VADDHN.I32 Dd+1, Qn, Qm
  //   A64: ADDHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<ushort> AddReturningHighNarrowUpper(Vector64<ushort> lower, Vector128<uint> left, Vector128<uint> right);

  // uint32x4_t vaddhn_high_u64 (uint32x2_t r, uint64x2_t a, uint64x2_t b)
  //   A32: VADDHN.I64 Dd+1, Qn, Qm
  //   A64: ADDHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<uint> AddReturningHighNarrowUpper(Vector64<uint> lower, Vector128<ulong> left, Vector128<ulong> right);

  // int8x8_t vraddhn_s16 (int16x8_t a, int16x8_t b)
  //   A32: VRADDHN.I16 Dd, Qn, Qm
  //   A64: RADDHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<sbyte> AddReturningRoundedHighNarrowLower(Vector128<short> left, Vector128<short> right);

  // int16x4_t vraddhn_s32 (int32x4_t a, int32x4_t b)
  //   A32: VRADDHN.I32 Dd, Qn, Qm
  //   A64: RADDHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<short> AddReturningRoundedHighNarrowLower(Vector128<int> left, Vector128<int> right);

  // int32x2_t vraddhn_s64 (int64x2_t a, int64x2_t b)
  //   A32: VRADDHN.I64 Dd, Qn, Qm
  //   A64: RADDHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<int> AddReturningRoundedHighNarrowLower(Vector128<long> left, Vector128<long> right);

  // uint8x8_t vraddhn_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VRADDHN.I16 Dd, Qn, Qm
  //   A64: RADDHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<byte> AddReturningRoundedHighNarrowLower(Vector128<ushort> left, Vector128<ushort> right);

  // uint16x4_t vraddhn_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VRADDHN.I32 Dd, Qn, Qm
  //   A64: RADDHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<ushort> AddReturningRoundedHighNarrowLower(Vector128<uint> left, Vector128<uint> right);

  // uint32x2_t vraddhn_u64 (uint64x2_t a, uint64x2_t b)
  //   A32: VRADDHN.I64 Dd, Qn, Qm
  //   A64: RADDHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<uint> AddReturningRoundedHighNarrowLower(Vector128<ulong> left, Vector128<ulong> right);

  // int8x16_t vraddhn_high_s16 (int8x8_t r, int16x8_t a, int16x8_t b)
  //   A32: VRADDHN.I16 Dd+1, Qn, Qm
  //   A64: RADDHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<sbyte> AddReturningRoundedHighNarrowUpper(Vector64<sbyte> lower, Vector128<short> left, Vector128<short> right);

  // int16x8_t vraddhn_high_s32 (int16x4_t r, int32x4_t a, int32x4_t b)
  //   A32: VRADDHN.I32 Dd+1, Qn, Qm
  //   A64: RADDHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<short> AddReturningRoundedHighNarrowUpper(Vector64<short> lower, Vector128<int> left, Vector128<int> right);

  // int32x4_t vraddhn_high_s64 (int32x2_t r, int64x2_t a, int64x2_t b)
  //   A32: VRADDHN.I64 Dd+1, Qn, Qm
  //   A64: RADDHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<int> AddReturningRoundedHighNarrowUpper(Vector64<int> lower, Vector128<long> left, Vector128<long> right);

  // uint8x16_t vraddhn_high_u16 (uint8x8_t r, uint16x8_t a, uint16x8_t b)
  //   A32: VRADDHN.I16 Dd+1, Qn, Qm
  //   A64: RADDHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<byte> AddReturningRoundedHighNarrowUpper(Vector64<byte> lower, Vector128<ushort> left, Vector128<ushort> right);

  // uint16x8_t vraddhn_high_u32 (uint16x4_t r, uint32x4_t a, uint32x4_t b)
  //   A32: VRADDHN.I32 Dd+1, Qn, Qm
  //   A64: RADDHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<ushort> AddReturningRoundedHighNarrowUpper(Vector64<ushort> lower, Vector128<uint> left, Vector128<uint> right);

  // uint32x4_t vraddhn_high_u64 (uint32x2_t r, uint64x2_t a, uint64x2_t b)
  //   A32: VRADDHN.I64 Dd+1, Qn, Qm
  //   A64: RADDHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<uint> AddReturningRoundedHighNarrowUpper(Vector64<uint> lower, Vector128<ulong> left, Vector128<ulong> right);

  // uint8x8_t vqadd_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VQADD.U8 Dd, Dn, Dm
  //   A64: UQADD Vd.8B, Vn.8B, Vm.8B
  public static Vector64<byte> AddSaturate(Vector64<byte> left, Vector64<byte> right);

  // int16x4_t vqadd_s16 (int16x4_t a, int16x4_t b)
  //   A32: VQADD.S16 Dd, Dn, Dm
  //   A64: SQADD Vd.4H, Vn.4H, Vm.4H
  public static Vector64<short> AddSaturate(Vector64<short> left, Vector64<short> right);

  // int32x2_t vqadd_s32 (int32x2_t a, int32x2_t b)
  //   A32: VQADD.S32 Dd, Dn, Dm
  //   A64: SQADD Vd.2S, Vn.2S, Vm.2S
  public static Vector64<int> AddSaturate(Vector64<int> left, Vector64<int> right);

  // int8x8_t vqadd_s8 (int8x8_t a, int8x8_t b)
  //   A32: VQADD.S8 Dd, Dn, Dm
  //   A64: SQADD Vd.8B, Vn.8B, Vm.8B
  public static Vector64<sbyte> AddSaturate(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint16x4_t vqadd_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VQADD.U16 Dd, Dn, Dm
  //   A64: UQADD Vd.4H, Vn.4H, Vm.4H
  public static Vector64<ushort> AddSaturate(Vector64<ushort> left, Vector64<ushort> right);

  // uint32x2_t vqadd_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VQADD.U32 Dd, Dn, Dm
  //   A64: UQADD Vd.2S, Vn.2S, Vm.2S
  public static Vector64<uint> AddSaturate(Vector64<uint> left, Vector64<uint> right);

  // uint8x16_t vqaddq_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VQADD.U8 Qd, Qn, Qm
  //   A64: UQADD Vd.16B, Vn.16B, Vm.16B
  public static Vector128<byte> AddSaturate(Vector128<byte> left, Vector128<byte> right);

  // int16x8_t vqaddq_s16 (int16x8_t a, int16x8_t b)
  //   A32: VQADD.S16 Qd, Qn, Qm
  //   A64: SQADD Vd.8H, Vn.8H, Vm.8H
  public static Vector128<short> AddSaturate(Vector128<short> left, Vector128<short> right);

  // int32x4_t vqaddq_s32 (int32x4_t a, int32x4_t b)
  //   A32: VQADD.S32 Qd, Qn, Qm
  //   A64: SQADD Vd.4S, Vn.4S, Vm.4S
  public static Vector128<int> AddSaturate(Vector128<int> left, Vector128<int> right);

  // int64x2_t vqaddq_s64 (int64x2_t a, int64x2_t b)
  //   A32: VQADD.S64 Qd, Qn, Qm
  //   A64: SQADD Vd.2D, Vn.2D, Vm.2D
  public static Vector128<long> AddSaturate(Vector128<long> left, Vector128<long> right);

  // int8x16_t vqaddq_s8 (int8x16_t a, int8x16_t b)
  //   A32: VQADD.S8 Qd, Qn, Qm
  //   A64: SQADD Vd.16B, Vn.16B, Vm.16B
  public static Vector128<sbyte> AddSaturate(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint16x8_t vqaddq_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VQADD.U16 Qd, Qn, Qm
  //   A64: UQADD Vd.8H, Vn.8H, Vm.8H
  public static Vector128<ushort> AddSaturate(Vector128<ushort> left, Vector128<ushort> right);

  // uint32x4_t vqaddq_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VQADD.U32 Qd, Qn, Qm
  //   A64: UQADD Vd.4S, Vn.4S, Vm.4S
  public static Vector128<uint> AddSaturate(Vector128<uint> left, Vector128<uint> right);

  // uint64x2_t vqaddq_u64 (uint64x2_t a, uint64x2_t b)
  //   A32: VQADD.U64 Qd, Qn, Qm
  //   A64: UQADD Vd.2D, Vn.2D, Vm.2D
  public static Vector128<ulong> AddSaturate(Vector128<ulong> left, Vector128<ulong> right);

  // int64x1_t vqadd_s64 (int64x1_t a, int64x1_t b)
  //   A32: VQADD.S64 Dd, Dn, Dm
  //   A64: SQADD Dd, Dn, Dm
  public static Vector64<long> AddSaturateScalar(Vector64<long> left, Vector64<long> right);

  // uint64x1_t vqadd_u64 (uint64x1_t a, uint64x1_t b)
  //   A32: VQADD.U64 Dd, Dn, Dm
  //   A64: UQADD Dd, Dn, Dm
  public static Vector64<ulong> AddSaturateScalar(Vector64<ulong> left, Vector64<ulong> right);

  // uint16x8_t vaddl_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VADDL.U8 Qd, Dn, Dm
  //   A64: UADDL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<ushort> AddWideningLower(Vector64<byte> left, Vector64<byte> right);

  // int32x4_t vaddl_s16 (int16x4_t a, int16x4_t b)
  //   A32: VADDL.S16 Qd, Dn, Dm
  //   A64: SADDL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<int> AddWideningLower(Vector64<short> left, Vector64<short> right);

  // int64x2_t vaddl_s32 (int32x2_t a, int32x2_t b)
  //   A32: VADDL.S32 Qd, Dn, Dm
  //   A64: SADDL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<long> AddWideningLower(Vector64<int> left, Vector64<int> right);

  // int16x8_t vaddl_s8 (int8x8_t a, int8x8_t b)
  //   A32: VADDL.S8 Qd, Dn, Dm
  //   A64: SADDL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<short> AddWideningLower(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint32x4_t vaddl_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VADDL.U16 Qd, Dn, Dm
  //   A64: UADDL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<uint> AddWideningLower(Vector64<ushort> left, Vector64<ushort> right);

  // uint64x2_t vaddl_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VADDL.U32 Qd, Dn, Dm
  //   A64: UADDL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<ulong> AddWideningLower(Vector64<uint> left, Vector64<uint> right);

  // int16x8_t vaddw_s8 (int16x8_t a, int8x8_t b)
  //   A32: VADDW.S8 Qd, Qn, Dm
  //   A64: SADDW Vd.8H, Vn.8H, Vm.8B
  public static Vector128<short> AddWideningLower(Vector128<short> left, Vector64<sbyte> right);

  // int32x4_t vaddw_s16 (int32x4_t a, int16x4_t b)
  //   A32: VADDW.S16 Qd, Qn, Dm
  //   A64: SADDW Vd.4S, Vn.4S, Vm.4H
  public static Vector128<int> AddWideningLower(Vector128<int> left, Vector64<short> right);

  // int64x2_t vaddw_s32 (int64x2_t a, int32x2_t b)
  //   A32: VADDW.S32 Qd, Qn, Dm
  //   A64: SADDW Vd.2D, Vn.2D, Vm.2S
  public static Vector128<long> AddWideningLower(Vector128<long> left, Vector64<int> right);

  // uint16x8_t vaddw_u8 (uint16x8_t a, uint8x8_t b)
  //   A32: VADDW.U8 Qd, Qn, Dm
  //   A64: UADDW Vd.8H, Vn.8H, Vm.8B
  public static Vector128<ushort> AddWideningLower(Vector128<ushort> left, Vector64<byte> right);

  // uint32x4_t vaddw_u16 (uint32x4_t a, uint16x4_t b)
  //   A32: VADDW.U16 Qd, Qn, Dm
  //   A64: UADDW Vd.4S, Vn.4S, Vm.4H
  public static Vector128<uint> AddWideningLower(Vector128<uint> left, Vector64<ushort> right);

  // uint64x2_t vaddw_u32 (uint64x2_t a, uint32x2_t b)
  //   A32: VADDW.U32 Qd, Qn, Dm
  //   A64: UADDW Vd.2D, Vn.2D, Vm.2S
  public static Vector128<ulong> AddWideningLower(Vector128<ulong> left, Vector64<uint> right);

  // uint16x8_t vaddl_high_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VADDL.U8 Qd, Dn+1, Dm+1
  //   A64: UADDL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<ushort> AddWideningUpper(Vector128<byte> left, Vector128<byte> right);

  // int32x4_t vaddl_high_s16 (int16x8_t a, int16x8_t b)
  //   A32: VADDL.S16 Qd, Dn+1, Dm+1
  //   A64: SADDL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<int> AddWideningUpper(Vector128<short> left, Vector128<short> right);

  // int16x8_t vaddw_high_s8 (int16x8_t a, int8x16_t b)
  //   A32: VADDW.S8 Qd, Qn, Dm+1
  //   A64: SADDW2 Vd.8H, Vn.8H, Vm.16B
  public static Vector128<short> AddWideningUpper(Vector128<short> left, Vector128<sbyte> right);

  // int32x4_t vaddw_high_s16 (int32x4_t a, int16x8_t b)
  //   A32: VADDW.S16 Qd, Qn, Dm+1
  //   A64: SADDW2 Vd.4S, Vn.4S, Vm.8H
  public static Vector128<int> AddWideningUpper(Vector128<int> left, Vector128<short> right);

  // int64x2_t vaddl_high_s32 (int32x4_t a, int32x4_t b)
  //   A32: VADDL.S32 Qd, Dn+1, Dm+1
  //   A64: SADDL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<long> AddWideningUpper(Vector128<int> left, Vector128<int> right);

  // int64x2_t vaddw_high_s32 (int64x2_t a, int32x4_t b)
  //   A32: VADDW.S32 Qd, Qn, Dm+1
  //   A64: SADDW2 Vd.2D, Vn.2D, Vm.4S
  public static Vector128<long> AddWideningUpper(Vector128<long> left, Vector128<int> right);

  // int16x8_t vaddl_high_s8 (int8x16_t a, int8x16_t b)
  //   A32: VADDL.S8 Qd, Dn+1, Dm+1
  //   A64: SADDL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<short> AddWideningUpper(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint16x8_t vaddw_high_u8 (uint16x8_t a, uint8x16_t b)
  //   A32: VADDW.U8 Qd, Qn, Dm+1
  //   A64: UADDW2 Vd.8H, Vn.8H, Vm.16B
  public static Vector128<ushort> AddWideningUpper(Vector128<ushort> left, Vector128<byte> right);

  // uint32x4_t vaddl_high_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VADDL.U16 Qd, Dn+1, Dm+1
  //   A64: UADDL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<uint> AddWideningUpper(Vector128<ushort> left, Vector128<ushort> right);

  // uint32x4_t vaddw_high_u16 (uint32x4_t a, uint16x8_t b)
  //   A32: VADDW.U16 Qd, Qn, Dm+1
  //   A64: UADDW2 Vd.4S, Vn.4S, Vm.8H
  public static Vector128<uint> AddWideningUpper(Vector128<uint> left, Vector128<ushort> right);

  // uint64x2_t vaddl_high_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VADDL.U32 Qd, Dn+1, Dm+1
  //   A64: UADDL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<ulong> AddWideningUpper(Vector128<uint> left, Vector128<uint> right);

  // uint64x2_t vaddw_high_u32 (uint64x2_t a, uint32x4_t b)
  //   A32: VADDW.U32 Qd, Qn, Dm+1
  //   A64: UADDW2 Vd.2D, Vn.2D, Vm.4S
  public static Vector128<ulong> AddWideningUpper(Vector128<ulong> left, Vector128<uint> right);

  // uint8x8_t vhadd_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VHADD.U8 Dd, Dn, Dm
  //   A64: UHADD Vd.8B, Vn.8B, Vm.8B
  public static Vector64<byte> FusedAddHalving(Vector64<byte> left, Vector64<byte> right);

  // int16x4_t vhadd_s16 (int16x4_t a, int16x4_t b)
  //   A32: VHADD.S16 Dd, Dn, Dm
  //   A64: SHADD Vd.4H, Vn.4H, Vm.4H
  public static Vector64<short> FusedAddHalving(Vector64<short> left, Vector64<short> right);

  // int32x2_t vhadd_s32 (int32x2_t a, int32x2_t b)
  //   A32: VHADD.S32 Dd, Dn, Dm
  //   A64: SHADD Vd.2S, Vn.2S, Vm.2S
  public static Vector64<int> FusedAddHalving(Vector64<int> left, Vector64<int> right);

  // int8x8_t vhadd_s8 (int8x8_t a, int8x8_t b)
  //   A32: VHADD.S8 Dd, Dn, Dm
  //   A64: SHADD Vd.8B, Vn.8B, Vm.8B
  public static Vector64<sbyte> FusedAddHalving(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint16x4_t vhadd_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VHADD.U16 Dd, Dn, Dm
  //   A64: UHADD Vd.4H, Vn.4H, Vm.4H
  public static Vector64<ushort> FusedAddHalving(Vector64<ushort> left, Vector64<ushort> right);

  // uint32x2_t vhadd_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VHADD.U32 Dd, Dn, Dm
  //   A64: UHADD Vd.2S, Vn.2S, Vm.2S
  public static Vector64<uint> FusedAddHalving(Vector64<uint> left, Vector64<uint> right);

  // uint8x16_t vhaddq_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VHADD.U8 Qd, Qn, Qm
  //   A64: UHADD Vd.16B, Vn.16B, Vm.16B
  public static Vector128<byte> FusedAddHalving(Vector128<byte> left, Vector128<byte> right);

  // int16x8_t vhaddq_s16 (int16x8_t a, int16x8_t b)
  //   A32: VHADD.S16 Qd, Qn, Qm
  //   A64: SHADD Vd.8H, Vn.8H, Vm.8H
  public static Vector128<short> FusedAddHalving(Vector128<short> left, Vector128<short> right);

  // int32x4_t vhaddq_s32 (int32x4_t a, int32x4_t b)
  //   A32: VHADD.S32 Qd, Qn, Qm
  //   A64: SHADD Vd.4S, Vn.4S, Vm.4S
  public static Vector128<int> FusedAddHalving(Vector128<int> left, Vector128<int> right);

  // int8x16_t vhaddq_s8 (int8x16_t a, int8x16_t b)
  //   A32: VHADD.S8 Qd, Qn, Qm
  //   A64: SHADD Vd.16B, Vn.16B, Vm.16B
  public static Vector128<sbyte> FusedAddHalving(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint16x8_t vhaddq_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VHADD.U16 Qd, Qn, Qm
  //   A64: UHADD Vd.8H, Vn.8H, Vm.8H
  public static Vector128<ushort> FusedAddHalving(Vector128<ushort> left, Vector128<ushort> right);

  // uint32x4_t vhaddq_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VHADD.U32 Qd, Qn, Qm
  //   A64: UHADD Vd.4S, Vn.4S, Vm.4S
  public static Vector128<uint> FusedAddHalving(Vector128<uint> left, Vector128<uint> right);

  // uint8x8_t vrhadd_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VRHADD.U8 Dd, Dn, Dm
  //   A64: URHADD Vd.8B, Vn.8B, Vm.8B
  public static Vector64<byte> FusedAddRoundedHalving(Vector64<byte> left, Vector64<byte> right);

  // int16x4_t vrhadd_s16 (int16x4_t a, int16x4_t b)
  //   A32: VRHADD.S16 Dd, Dn, Dm
  //   A64: SRHADD Vd.4H, Vn.4H, Vm.4H
  public static Vector64<short> FusedAddRoundedHalving(Vector64<short> left, Vector64<short> right);

  // int32x2_t vrhadd_s32 (int32x2_t a, int32x2_t b)
  //   A32: VRHADD.S32 Dd, Dn, Dm
  //   A64: SRHADD Vd.2S, Vn.2S, Vm.2S
  public static Vector64<int> FusedAddRoundedHalving(Vector64<int> left, Vector64<int> right);

  // int8x8_t vrhadd_s8 (int8x8_t a, int8x8_t b)
  //   A32: VRHADD.S8 Dd, Dn, Dm
  //   A64: SRHADD Vd.8B, Vn.8B, Vm.8B
  public static Vector64<sbyte> FusedAddRoundedHalving(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint16x4_t vrhadd_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VRHADD.U16 Dd, Dn, Dm
  //   A64: URHADD Vd.4H, Vn.4H, Vm.4H
  public static Vector64<ushort> FusedAddRoundedHalving(Vector64<ushort> left, Vector64<ushort> right);

  // uint32x2_t vrhadd_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VRHADD.U32 Dd, Dn, Dm
  //   A64: URHADD Vd.2S, Vn.2S, Vm.2S
  public static Vector64<uint> FusedAddRoundedHalving(Vector64<uint> left, Vector64<uint> right);

  // uint8x16_t vrhaddq_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VRHADD.U8 Qd, Qn, Qm
  //   A64: URHADD Vd.16B, Vn.16B, Vm.16B
  public static Vector128<byte> FusedAddRoundedHalving(Vector128<byte> left, Vector128<byte> right);

  // int16x8_t vrhaddq_s16 (int16x8_t a, int16x8_t b)
  //   A32: VRHADD.S16 Qd, Qn, Qm
  //   A64: SRHADD Vd.8H, Vn.8H, Vm.8H
  public static Vector128<short> FusedAddRoundedHalving(Vector128<short> left, Vector128<short> right);

  // int32x4_t vrhaddq_s32 (int32x4_t a, int32x4_t b)
  //   A32: VRHADD.S32 Qd, Qn, Qm
  //   A64: SRHADD Vd.4S, Vn.4S, Vm.4S
  public static Vector128<int> FusedAddRoundedHalving(Vector128<int> left, Vector128<int> right);

  // int8x16_t vrhaddq_s8 (int8x16_t a, int8x16_t b)
  //   A32: VRHADD.S8 Qd, Qn, Qm
  //   A64: SRHADD Vd.16B, Vn.16B, Vm.16B
  public static Vector128<sbyte> FusedAddRoundedHalving(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint16x8_t vrhaddq_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VRHADD.U16 Qd, Qn, Qm
  //   A64: URHADD Vd.8H, Vn.8H, Vm.8H
  public static Vector128<ushort> FusedAddRoundedHalving(Vector128<ushort> left, Vector128<ushort> right);

  // uint32x4_t vrhaddq_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VRHADD.U32 Qd, Qn, Qm
  //   A64: URHADD Vd.4S, Vn.4S, Vm.4S
  public static Vector128<uint> FusedAddRoundedHalving(Vector128<uint> left, Vector128<uint> right);

  // FusedSubtractHalving: twelve overloads covering byte, short, int, sbyte, ushort and uint
  // elements, first for Vector64 (D registers) and then for Vector128 (Q registers).
  // Every overload maps to VHSUB on A32; on A64 the signed element types use SHSUB and the
  // unsigned ones UHSUB. The result has the same vector and element type as the operands.
  // NOTE(review): presumably a per-element halving subtract (left - right, halved) —
  // confirm the exact semantics against the Arm ISA reference.

  // uint8x8_t vhsub_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VHSUB.U8 Dd, Dn, Dm
  //   A64: UHSUB Vd.8B, Vn.8B, Vm.8B
  public static Vector64<byte> FusedSubtractHalving(Vector64<byte> left, Vector64<byte> right);

  // int16x4_t vhsub_s16 (int16x4_t a, int16x4_t b)
  //   A32: VHSUB.S16 Dd, Dn, Dm
  //   A64: SHSUB Vd.4H, Vn.4H, Vm.4H
  public static Vector64<short> FusedSubtractHalving(Vector64<short> left, Vector64<short> right);

  // int32x2_t vhsub_s32 (int32x2_t a, int32x2_t b)
  //   A32: VHSUB.S32 Dd, Dn, Dm
  //   A64: SHSUB Vd.2S, Vn.2S, Vm.2S
  public static Vector64<int> FusedSubtractHalving(Vector64<int> left, Vector64<int> right);

  // int8x8_t vhsub_s8 (int8x8_t a, int8x8_t b)
  //   A32: VHSUB.S8 Dd, Dn, Dm
  //   A64: SHSUB Vd.8B, Vn.8B, Vm.8B
  public static Vector64<sbyte> FusedSubtractHalving(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint16x4_t vhsub_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VHSUB.U16 Dd, Dn, Dm
  //   A64: UHSUB Vd.4H, Vn.4H, Vm.4H
  public static Vector64<ushort> FusedSubtractHalving(Vector64<ushort> left, Vector64<ushort> right);

  // uint32x2_t vhsub_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VHSUB.U32 Dd, Dn, Dm
  //   A64: UHSUB Vd.2S, Vn.2S, Vm.2S
  public static Vector64<uint> FusedSubtractHalving(Vector64<uint> left, Vector64<uint> right);

  // uint8x16_t vhsubq_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VHSUB.U8 Qd, Qn, Qm
  //   A64: UHSUB Vd.16B, Vn.16B, Vm.16B
  public static Vector128<byte> FusedSubtractHalving(Vector128<byte> left, Vector128<byte> right);

  // int16x8_t vhsubq_s16 (int16x8_t a, int16x8_t b)
  //   A32: VHSUB.S16 Qd, Qn, Qm
  //   A64: SHSUB Vd.8H, Vn.8H, Vm.8H
  public static Vector128<short> FusedSubtractHalving(Vector128<short> left, Vector128<short> right);

  // int32x4_t vhsubq_s32 (int32x4_t a, int32x4_t b)
  //   A32: VHSUB.S32 Qd, Qn, Qm
  //   A64: SHSUB Vd.4S, Vn.4S, Vm.4S
  public static Vector128<int> FusedSubtractHalving(Vector128<int> left, Vector128<int> right);

  // int8x16_t vhsubq_s8 (int8x16_t a, int8x16_t b)
  //   A32: VHSUB.S8 Qd, Qn, Qm
  //   A64: SHSUB Vd.16B, Vn.16B, Vm.16B
  public static Vector128<sbyte> FusedSubtractHalving(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint16x8_t vhsubq_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VHSUB.U16 Qd, Qn, Qm
  //   A64: UHSUB Vd.8H, Vn.8H, Vm.8H
  public static Vector128<ushort> FusedSubtractHalving(Vector128<ushort> left, Vector128<ushort> right);

  // uint32x4_t vhsubq_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VHSUB.U32 Qd, Qn, Qm
  //   A64: UHSUB Vd.4S, Vn.4S, Vm.4S
  public static Vector128<uint> FusedSubtractHalving(Vector128<uint> left, Vector128<uint> right);

  // MultiplyWideningLower: six overloads, one per 8/16/32-bit signed and unsigned element
  // type. Both operands are Vector64 and the result is a Vector128 whose elements are twice
  // as wide (byte -> ushort, short -> int, int -> long, and so on).
  // Maps to VMULL on A32 and to SMULL (signed) / UMULL (unsigned) on A64.

  // uint16x8_t vmull_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VMULL.U8 Qd, Dn, Dm
  //   A64: UMULL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<ushort> MultiplyWideningLower(Vector64<byte> left, Vector64<byte> right);

  // int32x4_t vmull_s16 (int16x4_t a, int16x4_t b)
  //   A32: VMULL.S16 Qd, Dn, Dm
  //   A64: SMULL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<int> MultiplyWideningLower(Vector64<short> left, Vector64<short> right);

  // int64x2_t vmull_s32 (int32x2_t a, int32x2_t b)
  //   A32: VMULL.S32 Qd, Dn, Dm
  //   A64: SMULL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<long> MultiplyWideningLower(Vector64<int> left, Vector64<int> right);

  // int16x8_t vmull_s8 (int8x8_t a, int8x8_t b)
  //   A32: VMULL.S8 Qd, Dn, Dm
  //   A64: SMULL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<short> MultiplyWideningLower(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint32x4_t vmull_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VMULL.U16 Qd, Dn, Dm
  //   A64: UMULL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<uint> MultiplyWideningLower(Vector64<ushort> left, Vector64<ushort> right);

  // uint64x2_t vmull_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VMULL.U32 Qd, Dn, Dm
  //   A64: UMULL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<ulong> MultiplyWideningLower(Vector64<uint> left, Vector64<uint> right);

  // MultiplyWideningLowerAndAdd: six overloads, one per 8/16/32-bit signed and unsigned
  // element type. 'addend' is a Vector128 of the widened element type (intrinsic argument
  // 'a'); 'left' and 'right' are Vector64 operands of the narrow type ('b' and 'c').
  // Maps to VMLAL on A32 and to SMLAL (signed) / UMLAL (unsigned) on A64.
  // NOTE(review): presumably computes addend + widen(left) * widen(right) per element —
  // confirm against the Arm ISA reference.

  // int16x8_t vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
  //   A32: VMLAL.S8 Qd, Dn, Dm
  //   A64: SMLAL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<short> MultiplyWideningLowerAndAdd(Vector128<short> addend, Vector64<sbyte> left, Vector64<sbyte> right);

  // int32x4_t vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
  //   A32: VMLAL.S16 Qd, Dn, Dm
  //   A64: SMLAL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<int> MultiplyWideningLowerAndAdd(Vector128<int> addend, Vector64<short> left, Vector64<short> right);

  // int64x2_t vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
  //   A32: VMLAL.S32 Qd, Dn, Dm
  //   A64: SMLAL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<long> MultiplyWideningLowerAndAdd(Vector128<long> addend, Vector64<int> left, Vector64<int> right);

  // uint16x8_t vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
  //   A32: VMLAL.U8 Qd, Dn, Dm
  //   A64: UMLAL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<ushort> MultiplyWideningLowerAndAdd(Vector128<ushort> addend, Vector64<byte> left, Vector64<byte> right);

  // uint32x4_t vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
  //   A32: VMLAL.U16 Qd, Dn, Dm
  //   A64: UMLAL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<uint> MultiplyWideningLowerAndAdd(Vector128<uint> addend, Vector64<ushort> left, Vector64<ushort> right);

  // uint64x2_t vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
  //   A32: VMLAL.U32 Qd, Dn, Dm
  //   A64: UMLAL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<ulong> MultiplyWideningLowerAndAdd(Vector128<ulong> addend, Vector64<uint> left, Vector64<uint> right);

  // MultiplyWideningLowerAndSubtract: six overloads, one per 8/16/32-bit signed and unsigned
  // element type. 'minuend' is a Vector128 of the widened element type (intrinsic argument
  // 'a'); 'left' and 'right' are Vector64 operands of the narrow type ('b' and 'c').
  // Maps to VMLSL on A32 and to SMLSL (signed) / UMLSL (unsigned) on A64.
  // NOTE(review): presumably computes minuend - widen(left) * widen(right) per element —
  // confirm against the Arm ISA reference.

  // int16x8_t vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
  //   A32: VMLSL.S8 Qd, Dn, Dm
  //   A64: SMLSL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<short> MultiplyWideningLowerAndSubtract(Vector128<short> minuend, Vector64<sbyte> left, Vector64<sbyte> right);

  // int32x4_t vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
  //   A32: VMLSL.S16 Qd, Dn, Dm
  //   A64: SMLSL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<int> MultiplyWideningLowerAndSubtract(Vector128<int> minuend, Vector64<short> left, Vector64<short> right);

  // int64x2_t vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
  //   A32: VMLSL.S32 Qd, Dn, Dm
  //   A64: SMLSL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<long> MultiplyWideningLowerAndSubtract(Vector128<long> minuend, Vector64<int> left, Vector64<int> right);

  // uint16x8_t vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
  //   A32: VMLSL.U8 Qd, Dn, Dm
  //   A64: UMLSL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<ushort> MultiplyWideningLowerAndSubtract(Vector128<ushort> minuend, Vector64<byte> left, Vector64<byte> right);

  // uint32x4_t vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
  //   A32: VMLSL.U16 Qd, Dn, Dm
  //   A64: UMLSL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<uint> MultiplyWideningLowerAndSubtract(Vector128<uint> minuend, Vector64<ushort> left, Vector64<ushort> right);

  // uint64x2_t vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
  //   A32: VMLSL.U32 Qd, Dn, Dm
  //   A64: UMLSL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<ulong> MultiplyWideningLowerAndSubtract(Vector128<ulong> minuend, Vector64<uint> left, Vector64<uint> right);

  // MultiplyWideningUpper: six overloads, one per 8/16/32-bit signed and unsigned element
  // type. Both operands are Vector128 and the result is a Vector128 whose elements are
  // twice as wide. Maps to SMULL2 (signed) / UMULL2 (unsigned) on A64.
  // The A32 lines use VMULL on the Dn+1 / Dm+1 registers, i.e. the D registers that hold
  // the upper halves of the Q operands.

  // uint16x8_t vmull_high_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VMULL.U8 Qd, Dn+1, Dm+1
  //   A64: UMULL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<ushort> MultiplyWideningUpper(Vector128<byte> left, Vector128<byte> right);

  // int32x4_t vmull_high_s16 (int16x8_t a, int16x8_t b)
  //   A32: VMULL.S16 Qd, Dn+1, Dm+1
  //   A64: SMULL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<int> MultiplyWideningUpper(Vector128<short> left, Vector128<short> right);

  // int64x2_t vmull_high_s32 (int32x4_t a, int32x4_t b)
  //   A32: VMULL.S32 Qd, Dn+1, Dm+1
  //   A64: SMULL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<long> MultiplyWideningUpper(Vector128<int> left, Vector128<int> right);

  // int16x8_t vmull_high_s8 (int8x16_t a, int8x16_t b)
  //   A32: VMULL.S8 Qd, Dn+1, Dm+1
  //   A64: SMULL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<short> MultiplyWideningUpper(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint32x4_t vmull_high_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VMULL.U16 Qd, Dn+1, Dm+1
  //   A64: UMULL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<uint> MultiplyWideningUpper(Vector128<ushort> left, Vector128<ushort> right);

  // uint64x2_t vmull_high_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VMULL.U32 Qd, Dn+1, Dm+1
  //   A64: UMULL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<ulong> MultiplyWideningUpper(Vector128<uint> left, Vector128<uint> right);

  // MultiplyWideningUpperAndAdd: six overloads, one per 8/16/32-bit signed and unsigned
  // element type. 'addend' is a Vector128 of the widened element type; 'left' and 'right'
  // are Vector128 operands of the narrow type. Maps to SMLAL2 (signed) / UMLAL2 (unsigned)
  // on A64; the A32 lines use VMLAL on the Dn+1 / Dm+1 (upper-half) registers.
  // NOTE(review): presumably accumulates the widened products of the upper halves into
  // 'addend' — confirm against the Arm ISA reference.

  // int16x8_t vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
  //   A32: VMLAL.S8 Qd, Dn+1, Dm+1
  //   A64: SMLAL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<short> MultiplyWideningUpperAndAdd(Vector128<short> addend, Vector128<sbyte> left, Vector128<sbyte> right);

  // int32x4_t vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
  //   A32: VMLAL.S16 Qd, Dn+1, Dm+1
  //   A64: SMLAL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<int> MultiplyWideningUpperAndAdd(Vector128<int> addend, Vector128<short> left, Vector128<short> right);

  // int64x2_t vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
  //   A32: VMLAL.S32 Qd, Dn+1, Dm+1
  //   A64: SMLAL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<long> MultiplyWideningUpperAndAdd(Vector128<long> addend, Vector128<int> left, Vector128<int> right);

  // uint16x8_t vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
  //   A32: VMLAL.U8 Qd, Dn+1, Dm+1
  //   A64: UMLAL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<ushort> MultiplyWideningUpperAndAdd(Vector128<ushort> addend, Vector128<byte> left, Vector128<byte> right);

  // uint32x4_t vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
  //   A32: VMLAL.U16 Qd, Dn+1, Dm+1
  //   A64: UMLAL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<uint> MultiplyWideningUpperAndAdd(Vector128<uint> addend, Vector128<ushort> left, Vector128<ushort> right);

  // uint64x2_t vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
  //   A32: VMLAL.U32 Qd, Dn+1, Dm+1
  //   A64: UMLAL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<ulong> MultiplyWideningUpperAndAdd(Vector128<ulong> addend, Vector128<uint> left, Vector128<uint> right);

  // MultiplyWideningUpperAndSubtract: six overloads, one per 8/16/32-bit signed and unsigned
  // element type. 'minuend' is a Vector128 of the widened element type; 'left' and 'right'
  // are Vector128 operands of the narrow type. Maps to SMLSL2 (signed) / UMLSL2 (unsigned)
  // on A64; the A32 lines use VMLSL on the Dn+1 / Dm+1 (upper-half) registers.
  // NOTE(review): presumably subtracts the widened products of the upper halves from
  // 'minuend' — confirm against the Arm ISA reference.

  // int16x8_t vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
  //   A32: VMLSL.S8 Qd, Dn+1, Dm+1
  //   A64: SMLSL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<short> MultiplyWideningUpperAndSubtract(Vector128<short> minuend, Vector128<sbyte> left, Vector128<sbyte> right);

  // int32x4_t vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
  //   A32: VMLSL.S16 Qd, Dn+1, Dm+1
  //   A64: SMLSL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<int> MultiplyWideningUpperAndSubtract(Vector128<int> minuend, Vector128<short> left, Vector128<short> right);

  // int64x2_t vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
  //   A32: VMLSL.S32 Qd, Dn+1, Dm+1
  //   A64: SMLSL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<long> MultiplyWideningUpperAndSubtract(Vector128<long> minuend, Vector128<int> left, Vector128<int> right);

  // uint16x8_t vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
  //   A32: VMLSL.U8 Qd, Dn+1, Dm+1
  //   A64: UMLSL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<ushort> MultiplyWideningUpperAndSubtract(Vector128<ushort> minuend, Vector128<byte> left, Vector128<byte> right);

  // uint32x4_t vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
  //   A32: VMLSL.U16 Qd, Dn+1, Dm+1
  //   A64: UMLSL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<uint> MultiplyWideningUpperAndSubtract(Vector128<uint> minuend, Vector128<ushort> left, Vector128<ushort> right);

  // uint64x2_t vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
  //   A32: VMLSL.U32 Qd, Dn+1, Dm+1
  //   A64: UMLSL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<ulong> MultiplyWideningUpperAndSubtract(Vector128<ulong> minuend, Vector128<uint> left, Vector128<uint> right);

  // SubtractReturningHighNarrowLower: six overloads, one per 16/32/64-bit signed and
  // unsigned element type. Both operands are Vector128 and the result is a Vector64 whose
  // elements are half as wide (short -> sbyte, int -> short, long -> int, and so on).
  // Maps to VSUBHN on A32 and SUBHN on A64; the same instruction serves the signed and
  // unsigned overloads (the A32 suffix is .I16/.I32/.I64).

  // int8x8_t vsubhn_s16 (int16x8_t a, int16x8_t b)
  //   A32: VSUBHN.I16 Dd, Qn, Qm
  //   A64: SUBHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<sbyte> SubtractReturningHighNarrowLower(Vector128<short> left, Vector128<short> right);

  // int16x4_t vsubhn_s32 (int32x4_t a, int32x4_t b)
  //   A32: VSUBHN.I32 Dd, Qn, Qm
  //   A64: SUBHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<short> SubtractReturningHighNarrowLower(Vector128<int> left, Vector128<int> right);

  // int32x2_t vsubhn_s64 (int64x2_t a, int64x2_t b)
  //   A32: VSUBHN.I64 Dd, Qn, Qm
  //   A64: SUBHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<int> SubtractReturningHighNarrowLower(Vector128<long> left, Vector128<long> right);

  // uint8x8_t vsubhn_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VSUBHN.I16 Dd, Qn, Qm
  //   A64: SUBHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<byte> SubtractReturningHighNarrowLower(Vector128<ushort> left, Vector128<ushort> right);

  // uint16x4_t vsubhn_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VSUBHN.I32 Dd, Qn, Qm
  //   A64: SUBHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<ushort> SubtractReturningHighNarrowLower(Vector128<uint> left, Vector128<uint> right);

  // uint32x2_t vsubhn_u64 (uint64x2_t a, uint64x2_t b)
  //   A32: VSUBHN.I64 Dd, Qn, Qm
  //   A64: SUBHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<uint> SubtractReturningHighNarrowLower(Vector128<ulong> left, Vector128<ulong> right);

  // SubtractReturningHighNarrowUpper: six overloads, one per 16/32/64-bit signed and
  // unsigned element type. 'lower' (intrinsic argument 'r') is a Vector64 of the narrow
  // element type; 'left' and 'right' are Vector128 operands of the wide type; the result is
  // a Vector128 of the narrow element type.
  // Maps to SUBHN2 on A64, whose two source operands both use the wide arrangement
  // (8H / 4S / 2D). The A32 lines use VSUBHN writing Dd+1, the upper half of the Q result.
  // NOTE(review): presumably 'lower' supplies the low half of the result — confirm.

  // int8x16_t vsubhn_high_s16 (int8x8_t r, int16x8_t a, int16x8_t b)
  //   A32: VSUBHN.I16 Dd+1, Qn, Qm
  //   A64: SUBHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<sbyte> SubtractReturningHighNarrowUpper(Vector64<sbyte> lower, Vector128<short> left, Vector128<short> right);

  // int16x8_t vsubhn_high_s32 (int16x4_t r, int32x4_t a, int32x4_t b)
  //   A32: VSUBHN.I32 Dd+1, Qn, Qm
  //   A64: SUBHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<short> SubtractReturningHighNarrowUpper(Vector64<short> lower, Vector128<int> left, Vector128<int> right);

  // int32x4_t vsubhn_high_s64 (int32x2_t r, int64x2_t a, int64x2_t b)
  //   A32: VSUBHN.I64 Dd+1, Qn, Qm
  //   A64: SUBHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<int> SubtractReturningHighNarrowUpper(Vector64<int> lower, Vector128<long> left, Vector128<long> right);

  // uint8x16_t vsubhn_high_u16 (uint8x8_t r, uint16x8_t a, uint16x8_t b)
  //   A32: VSUBHN.I16 Dd+1, Qn, Qm
  //   A64: SUBHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<byte> SubtractReturningHighNarrowUpper(Vector64<byte> lower, Vector128<ushort> left, Vector128<ushort> right);

  // uint16x8_t vsubhn_high_u32 (uint16x4_t r, uint32x4_t a, uint32x4_t b)
  //   A32: VSUBHN.I32 Dd+1, Qn, Qm
  //   A64: SUBHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<ushort> SubtractReturningHighNarrowUpper(Vector64<ushort> lower, Vector128<uint> left, Vector128<uint> right);

  // uint32x4_t vsubhn_high_u64 (uint32x2_t r, uint64x2_t a, uint64x2_t b)
  //   A32: VSUBHN.I64 Dd+1, Qn, Qm
  //   A64: SUBHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<uint> SubtractReturningHighNarrowUpper(Vector64<uint> lower, Vector128<ulong> left, Vector128<ulong> right);

  // SubtractReturningRoundedHighNarrowLower: six overloads, one per 16/32/64-bit signed and
  // unsigned element type. Both operands are Vector128 and the result is a Vector64 whose
  // elements are half as wide. Maps to VRSUBHN on A32 and RSUBHN on A64; the same
  // instruction serves the signed and unsigned overloads.
  // NOTE(review): presumably the rounding variant of SubtractReturningHighNarrowLower —
  // confirm the rounding behaviour against the Arm ISA reference.

  // int8x8_t vrsubhn_s16 (int16x8_t a, int16x8_t b)
  //   A32: VRSUBHN.I16 Dd, Qn, Qm
  //   A64: RSUBHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<sbyte> SubtractReturningRoundedHighNarrowLower(Vector128<short> left, Vector128<short> right);

  // int16x4_t vrsubhn_s32 (int32x4_t a, int32x4_t b)
  //   A32: VRSUBHN.I32 Dd, Qn, Qm
  //   A64: RSUBHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<short> SubtractReturningRoundedHighNarrowLower(Vector128<int> left, Vector128<int> right);

  // int32x2_t vrsubhn_s64 (int64x2_t a, int64x2_t b)
  //   A32: VRSUBHN.I64 Dd, Qn, Qm
  //   A64: RSUBHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<int> SubtractReturningRoundedHighNarrowLower(Vector128<long> left, Vector128<long> right);

  // uint8x8_t vrsubhn_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VRSUBHN.I16 Dd, Qn, Qm
  //   A64: RSUBHN Vd.8B, Vn.8H, Vm.8H
  public static Vector64<byte> SubtractReturningRoundedHighNarrowLower(Vector128<ushort> left, Vector128<ushort> right);

  // uint16x4_t vrsubhn_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VRSUBHN.I32 Dd, Qn, Qm
  //   A64: RSUBHN Vd.4H, Vn.4S, Vm.4S
  public static Vector64<ushort> SubtractReturningRoundedHighNarrowLower(Vector128<uint> left, Vector128<uint> right);

  // uint32x2_t vrsubhn_u64 (uint64x2_t a, uint64x2_t b)
  //   A32: VRSUBHN.I64 Dd, Qn, Qm
  //   A64: RSUBHN Vd.2S, Vn.2D, Vm.2D
  public static Vector64<uint> SubtractReturningRoundedHighNarrowLower(Vector128<ulong> left, Vector128<ulong> right);

  // SubtractReturningRoundedHighNarrowUpper: six overloads, one per 16/32/64-bit signed and
  // unsigned element type. 'lower' (intrinsic argument 'r') is a Vector64 of the narrow
  // element type; 'left' and 'right' are Vector128 operands of the wide type; the result is
  // a Vector128 of the narrow element type.
  // Maps to RSUBHN2 on A64, whose two source operands both use the wide arrangement
  // (8H / 4S / 2D). The A32 lines use VRSUBHN writing Dd+1, the upper half of the Q result.
  // NOTE(review): presumably 'lower' supplies the low half of the result — confirm.

  // int8x16_t vrsubhn_high_s16 (int8x8_t r, int16x8_t a, int16x8_t b)
  //   A32: VRSUBHN.I16 Dd+1, Qn, Qm
  //   A64: RSUBHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<sbyte> SubtractReturningRoundedHighNarrowUpper(Vector64<sbyte> lower, Vector128<short> left, Vector128<short> right);

  // int16x8_t vrsubhn_high_s32 (int16x4_t r, int32x4_t a, int32x4_t b)
  //   A32: VRSUBHN.I32 Dd+1, Qn, Qm
  //   A64: RSUBHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<short> SubtractReturningRoundedHighNarrowUpper(Vector64<short> lower, Vector128<int> left, Vector128<int> right);

  // int32x4_t vrsubhn_high_s64 (int32x2_t r, int64x2_t a, int64x2_t b)
  //   A32: VRSUBHN.I64 Dd+1, Qn, Qm
  //   A64: RSUBHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<int> SubtractReturningRoundedHighNarrowUpper(Vector64<int> lower, Vector128<long> left, Vector128<long> right);

  // uint8x16_t vrsubhn_high_u16 (uint8x8_t r, uint16x8_t a, uint16x8_t b)
  //   A32: VRSUBHN.I16 Dd+1, Qn, Qm
  //   A64: RSUBHN2 Vd.16B, Vn.8H, Vm.8H
  public static Vector128<byte> SubtractReturningRoundedHighNarrowUpper(Vector64<byte> lower, Vector128<ushort> left, Vector128<ushort> right);

  // uint16x8_t vrsubhn_high_u32 (uint16x4_t r, uint32x4_t a, uint32x4_t b)
  //   A32: VRSUBHN.I32 Dd+1, Qn, Qm
  //   A64: RSUBHN2 Vd.8H, Vn.4S, Vm.4S
  public static Vector128<ushort> SubtractReturningRoundedHighNarrowUpper(Vector64<ushort> lower, Vector128<uint> left, Vector128<uint> right);

  // uint32x4_t vrsubhn_high_u64 (uint32x2_t r, uint64x2_t a, uint64x2_t b)
  //   A32: VRSUBHN.I64 Dd+1, Qn, Qm
  //   A64: RSUBHN2 Vd.4S, Vn.2D, Vm.2D
  public static Vector128<uint> SubtractReturningRoundedHighNarrowUpper(Vector64<uint> lower, Vector128<ulong> left, Vector128<ulong> right);

  // SubtractSaturate: fourteen overloads. The Vector64 forms cover byte, short, int, sbyte,
  // ushort and uint; the Vector128 forms cover those six plus long and ulong. The 64-bit
  // element Vector64 forms are declared separately as SubtractSaturateScalar.
  // Every overload maps to VQSUB on A32; on A64 the signed element types use SQSUB and the
  // unsigned ones UQSUB. The result has the same vector and element type as the operands.
  // NOTE(review): presumably left - right clamped to the element type's range — confirm
  // against the Arm ISA reference.

  // uint8x8_t vqsub_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VQSUB.U8 Dd, Dn, Dm
  //   A64: UQSUB Vd.8B, Vn.8B, Vm.8B
  public static Vector64<byte> SubtractSaturate(Vector64<byte> left, Vector64<byte> right);

  // int16x4_t vqsub_s16 (int16x4_t a, int16x4_t b)
  //   A32: VQSUB.S16 Dd, Dn, Dm
  //   A64: SQSUB Vd.4H, Vn.4H, Vm.4H
  public static Vector64<short> SubtractSaturate(Vector64<short> left, Vector64<short> right);

  // int32x2_t vqsub_s32 (int32x2_t a, int32x2_t b)
  //   A32: VQSUB.S32 Dd, Dn, Dm
  //   A64: SQSUB Vd.2S, Vn.2S, Vm.2S
  public static Vector64<int> SubtractSaturate(Vector64<int> left, Vector64<int> right);

  // int8x8_t vqsub_s8 (int8x8_t a, int8x8_t b)
  //   A32: VQSUB.S8 Dd, Dn, Dm
  //   A64: SQSUB Vd.8B, Vn.8B, Vm.8B
  public static Vector64<sbyte> SubtractSaturate(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint16x4_t vqsub_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VQSUB.U16 Dd, Dn, Dm
  //   A64: UQSUB Vd.4H, Vn.4H, Vm.4H
  public static Vector64<ushort> SubtractSaturate(Vector64<ushort> left, Vector64<ushort> right);

  // uint32x2_t vqsub_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VQSUB.U32 Dd, Dn, Dm
  //   A64: UQSUB Vd.2S, Vn.2S, Vm.2S
  public static Vector64<uint> SubtractSaturate(Vector64<uint> left, Vector64<uint> right);

  // uint8x16_t vqsubq_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VQSUB.U8 Qd, Qn, Qm
  //   A64: UQSUB Vd.16B, Vn.16B, Vm.16B
  public static Vector128<byte> SubtractSaturate(Vector128<byte> left, Vector128<byte> right);

  // int16x8_t vqsubq_s16 (int16x8_t a, int16x8_t b)
  //   A32: VQSUB.S16 Qd, Qn, Qm
  //   A64: SQSUB Vd.8H, Vn.8H, Vm.8H
  public static Vector128<short> SubtractSaturate(Vector128<short> left, Vector128<short> right);

  // int32x4_t vqsubq_s32 (int32x4_t a, int32x4_t b)
  //   A32: VQSUB.S32 Qd, Qn, Qm
  //   A64: SQSUB Vd.4S, Vn.4S, Vm.4S
  public static Vector128<int> SubtractSaturate(Vector128<int> left, Vector128<int> right);

  // int64x2_t vqsubq_s64 (int64x2_t a, int64x2_t b)
  //   A32: VQSUB.S64 Qd, Qn, Qm
  //   A64: SQSUB Vd.2D, Vn.2D, Vm.2D
  public static Vector128<long> SubtractSaturate(Vector128<long> left, Vector128<long> right);

  // int8x16_t vqsubq_s8 (int8x16_t a, int8x16_t b)
  //   A32: VQSUB.S8 Qd, Qn, Qm
  //   A64: SQSUB Vd.16B, Vn.16B, Vm.16B
  public static Vector128<sbyte> SubtractSaturate(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint16x8_t vqsubq_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VQSUB.U16 Qd, Qn, Qm
  //   A64: UQSUB Vd.8H, Vn.8H, Vm.8H
  public static Vector128<ushort> SubtractSaturate(Vector128<ushort> left, Vector128<ushort> right);

  // uint32x4_t vqsubq_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VQSUB.U32 Qd, Qn, Qm
  //   A64: UQSUB Vd.4S, Vn.4S, Vm.4S
  public static Vector128<uint> SubtractSaturate(Vector128<uint> left, Vector128<uint> right);

  // uint64x2_t vqsubq_u64 (uint64x2_t a, uint64x2_t b)
  //   A32: VQSUB.U64 Qd, Qn, Qm
  //   A64: UQSUB Vd.2D, Vn.2D, Vm.2D
  public static Vector128<ulong> SubtractSaturate(Vector128<ulong> left, Vector128<ulong> right);

  // SubtractSaturateScalar: the single-element 64-bit forms (int64x1_t / uint64x1_t),
  // exposed as Vector64<long> and Vector64<ulong>. On A64 these use the scalar D-register
  // encodings of SQSUB / UQSUB (operands written Dd, Dn, Dm rather than a vector
  // arrangement); on A32 they use VQSUB.S64 / VQSUB.U64 on D registers.

  // int64x1_t vqsub_s64 (int64x1_t a, int64x1_t b)
  //   A32: VQSUB.S64 Dd, Dn, Dm
  //   A64: SQSUB Dd, Dn, Dm
  public static Vector64<long> SubtractSaturateScalar(Vector64<long> left, Vector64<long> right);

  // uint64x1_t vqsub_u64 (uint64x1_t a, uint64x1_t b)
  //   A32: VQSUB.U64 Dd, Dn, Dm
  //   A64: UQSUB Dd, Dn, Dm
  public static Vector64<ulong> SubtractSaturateScalar(Vector64<ulong> left, Vector64<ulong> right);

  // SubtractWideningLower: twelve overloads in two shapes, each covering the 8/16/32-bit
  // signed and unsigned element types and returning a Vector128 of the widened type.
  //   * vsubl_* — both operands are Vector64 of the narrow type
  //               (A32 VSUBL; A64 SSUBL / USUBL).
  //   * vsubw_* — 'left' is already a Vector128 of the wide type and only 'right' is a
  //               narrow Vector64 (A32 VSUBW; A64 SSUBW / USUBW).

  // uint16x8_t vsubl_u8 (uint8x8_t a, uint8x8_t b)
  //   A32: VSUBL.U8 Qd, Dn, Dm
  //   A64: USUBL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<ushort> SubtractWideningLower(Vector64<byte> left, Vector64<byte> right);

  // int32x4_t vsubl_s16 (int16x4_t a, int16x4_t b)
  //   A32: VSUBL.S16 Qd, Dn, Dm
  //   A64: SSUBL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<int> SubtractWideningLower(Vector64<short> left, Vector64<short> right);

  // int64x2_t vsubl_s32 (int32x2_t a, int32x2_t b)
  //   A32: VSUBL.S32 Qd, Dn, Dm
  //   A64: SSUBL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<long> SubtractWideningLower(Vector64<int> left, Vector64<int> right);

  // int16x8_t vsubl_s8 (int8x8_t a, int8x8_t b)
  //   A32: VSUBL.S8 Qd, Dn, Dm
  //   A64: SSUBL Vd.8H, Vn.8B, Vm.8B
  public static Vector128<short> SubtractWideningLower(Vector64<sbyte> left, Vector64<sbyte> right);

  // uint32x4_t vsubl_u16 (uint16x4_t a, uint16x4_t b)
  //   A32: VSUBL.U16 Qd, Dn, Dm
  //   A64: USUBL Vd.4S, Vn.4H, Vm.4H
  public static Vector128<uint> SubtractWideningLower(Vector64<ushort> left, Vector64<ushort> right);

  // uint64x2_t vsubl_u32 (uint32x2_t a, uint32x2_t b)
  //   A32: VSUBL.U32 Qd, Dn, Dm
  //   A64: USUBL Vd.2D, Vn.2S, Vm.2S
  public static Vector128<ulong> SubtractWideningLower(Vector64<uint> left, Vector64<uint> right);

  // int16x8_t vsubw_s8 (int16x8_t a, int8x8_t b)
  //   A32: VSUBW.S8 Qd, Qn, Dm
  //   A64: SSUBW Vd.8H, Vn.8H, Vm.8B
  public static Vector128<short> SubtractWideningLower(Vector128<short> left, Vector64<sbyte> right);

  // int32x4_t vsubw_s16 (int32x4_t a, int16x4_t b)
  //   A32: VSUBW.S16 Qd, Qn, Dm
  //   A64: SSUBW Vd.4S, Vn.4S, Vm.4H
  public static Vector128<int> SubtractWideningLower(Vector128<int> left, Vector64<short> right);

  // int64x2_t vsubw_s32 (int64x2_t a, int32x2_t b)
  //   A32: VSUBW.S32 Qd, Qn, Dm
  //   A64: SSUBW Vd.2D, Vn.2D, Vm.2S
  public static Vector128<long> SubtractWideningLower(Vector128<long> left, Vector64<int> right);

  // uint16x8_t vsubw_u8 (uint16x8_t a, uint8x8_t b)
  //   A32: VSUBW.U8 Qd, Qn, Dm
  //   A64: USUBW Vd.8H, Vn.8H, Vm.8B
  public static Vector128<ushort> SubtractWideningLower(Vector128<ushort> left, Vector64<byte> right);

  // uint32x4_t vsubw_u16 (uint32x4_t a, uint16x4_t b)
  //   A32: VSUBW.U16 Qd, Qn, Dm
  //   A64: USUBW Vd.4S, Vn.4S, Vm.4H
  public static Vector128<uint> SubtractWideningLower(Vector128<uint> left, Vector64<ushort> right);

  // uint64x2_t vsubw_u32 (uint64x2_t a, uint32x2_t b)
  //   A32: VSUBW.U32 Qd, Qn, Dm
  //   A64: USUBW Vd.2D, Vn.2D, Vm.2S
  public static Vector128<ulong> SubtractWideningLower(Vector128<ulong> left, Vector64<uint> right);

  // SubtractWideningUpper overloads come in two shapes, interleaved by element type:
  //   * vsubl_high_* — both operands are Vector128 of the narrow type
  //                    (A64 SSUBL2 / USUBL2; A32 VSUBL on Dn+1, Dm+1).
  //   * vsubw_high_* — 'left' is a Vector128 of the wide type and 'right' is a Vector128 of
  //                    the narrow type (A64 SSUBW2 / USUBW2; A32 VSUBW on Qn, Dm+1).
  // On A32 the "+1" registers are the D registers holding the upper halves of the Q operands.

  // uint16x8_t vsubl_high_u8 (uint8x16_t a, uint8x16_t b)
  //   A32: VSUBL.U8 Qd, Dn+1, Dm+1
  //   A64: USUBL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<ushort> SubtractWideningUpper(Vector128<byte> left, Vector128<byte> right);

  // int32x4_t vsubl_high_s16 (int16x8_t a, int16x8_t b)
  //   A32: VSUBL.S16 Qd, Dn+1, Dm+1
  //   A64: SSUBL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<int> SubtractWideningUpper(Vector128<short> left, Vector128<short> right);

  // int16x8_t vsubw_high_s8 (int16x8_t a, int8x16_t b)
  //   A32: VSUBW.S8 Qd, Qn, Dm+1
  //   A64: SSUBW2 Vd.8H, Vn.8H, Vm.16B
  public static Vector128<short> SubtractWideningUpper(Vector128<short> left, Vector128<sbyte> right);

  // int32x4_t vsubw_high_s16 (int32x4_t a, int16x8_t b)
  //   A32: VSUBW.S16 Qd, Qn, Dm+1
  //   A64: SSUBW2 Vd.4S, Vn.4S, Vm.8H
  public static Vector128<int> SubtractWideningUpper(Vector128<int> left, Vector128<short> right);

  // int64x2_t vsubl_high_s32 (int32x4_t a, int32x4_t b)
  //   A32: VSUBL.S32 Qd, Dn+1, Dm+1
  //   A64: SSUBL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<long> SubtractWideningUpper(Vector128<int> left, Vector128<int> right);

  // Signed 32-bit wide-by-narrow form of SubtractWideningUpper. Like the other
  // vsubw_high_* overloads (s8, s16, u8, u16, u32), 'right' is a full Vector128 of the
  // narrow element type; the A32 form reads its upper half via Dm+1.
  // int64x2_t vsubw_high_s32 (int64x2_t a, int32x4_t b)
  //   A32: VSUBW.S32 Qd, Qn, Dm+1
  //   A64: SSUBW2 Vd.2D, Vn.2D, Vm.4S
  public static Vector128<long> SubtractWideningUpper(Vector128<long> left, Vector128<int> right);

  // Remaining SubtractWideningUpper overloads, in two shapes interleaved by element type:
  //   * vsubl_high_* — both operands are Vector128 of the narrow type
  //                    (A64 SSUBL2 / USUBL2; A32 VSUBL on Dn+1, Dm+1).
  //   * vsubw_high_* — 'left' is a Vector128 of the wide type and 'right' is a Vector128 of
  //                    the narrow type (A64 USUBW2; A32 VSUBW on Qn, Dm+1).

  // int16x8_t vsubl_high_s8 (int8x16_t a, int8x16_t b)
  //   A32: VSUBL.S8 Qd, Dn+1, Dm+1
  //   A64: SSUBL2 Vd.8H, Vn.16B, Vm.16B
  public static Vector128<short> SubtractWideningUpper(Vector128<sbyte> left, Vector128<sbyte> right);

  // uint16x8_t vsubw_high_u8 (uint16x8_t a, uint8x16_t b)
  //   A32: VSUBW.U8 Qd, Qn, Dm+1
  //   A64: USUBW2 Vd.8H, Vn.8H, Vm.16B
  public static Vector128<ushort> SubtractWideningUpper(Vector128<ushort> left, Vector128<byte> right);

  // uint32x4_t vsubl_high_u16 (uint16x8_t a, uint16x8_t b)
  //   A32: VSUBL.U16 Qd, Dn+1, Dm+1
  //   A64: USUBL2 Vd.4S, Vn.8H, Vm.8H
  public static Vector128<uint> SubtractWideningUpper(Vector128<ushort> left, Vector128<ushort> right);

  // uint32x4_t vsubw_high_u16 (uint32x4_t a, uint16x8_t b)
  //   A32: VSUBW.U16 Qd, Qn, Dm+1
  //   A64: USUBW2 Vd.4S, Vn.4S, Vm.8H
  public static Vector128<uint> SubtractWideningUpper(Vector128<uint> left, Vector128<ushort> right);

  // uint64x2_t vsubl_high_u32 (uint32x4_t a, uint32x4_t b)
  //   A32: VSUBL.U32 Qd, Dn+1, Dm+1
  //   A64: USUBL2 Vd.2D, Vn.4S, Vm.4S
  public static Vector128<ulong> SubtractWideningUpper(Vector128<uint> left, Vector128<uint> right);

  // uint64x2_t vsubw_high_u32 (uint64x2_t a, uint32x4_t b)
  //   A32: VSUBW.U32 Qd, Qn, Dm+1
  //   A64: USUBW2 Vd.2D, Vn.2D, Vm.4S
  public static Vector128<ulong> SubtractWideningUpper(Vector128<ulong> left, Vector128<uint> right);
}

@TamarChristinaArm Can you please take a look and verify my assumptions about how the *Upper intrinsics should be implemented on A32?

@TamarChristinaArm
Copy link
Contributor

@TamarChristinaArm Can you please take a look and verify my assumptions about how the *Upper intrinsics should be implemented on A32?

@echesakovMSFT Yeah those look correct to me. I'll make sure they're also corrected in the next ACLE release.

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
api-approved API was approved in API review, it can be implemented arch-arm64 area-System.Runtime.Intrinsics
Projects
None yet
Development

Successfully merging a pull request may close this issue.

5 participants