From ba04808ed4090143747f56f636b4d3854295faf8 Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Mon, 24 Jun 2024 15:12:23 +0200
Subject: [PATCH 1/3] integrate SIMD Unicode, attempt 2

---
 .../Text/Unicode/Utf8Utility.Validation.cs    | 1036 +++++++++++++++++
 1 file changed, 1036 insertions(+)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index a542dad72b5c3..c1b0fc942788f 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -41,6 +41,33 @@ internal static unsafe partial class Utf8Utility
                 return pInputBuffer;
             }
 
+            if (BitConverter.IsLittleEndian && inputLength > 128)
+            {
+                if (AdvSimd.Arm64.IsSupported)
+                {
+                    return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+                }
+                if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported)
+                {
+                    return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+                }
+                if (Avx2.IsSupported)
+                {
+                    return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+                }
+                if (Sse41.IsSupported)
+                {
+                    return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+                }
+            }
+            return GetPointerToFirstInvalidByteFallback(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+        }
+
+        private static byte* GetPointerToFirstInvalidByteFallback(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
 #if DEBUG
             // Keep these around for final validation at the end of the method.
             byte* pOriginalInputBuffer = pInputBuffer;
@@ -753,5 +780,1014 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
             extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
             return extractedBits.AsUInt64().ToScalar();
         }
+
+
+        // SimdUnicode:
+
+
+        // Scalar UTF-8 validation used by the vectorized paths. Starting at buf, we first step back by up
+        // to howFarBack bytes until we reach a byte that is not a continuation byte, then validate forward
+        // to the end of the original [buf, buf + len) range. Returns that end pointer when everything is
+        // valid; otherwise returns a pointer to the lead byte of the first invalid sequence.
+        // The caller must guarantee that the howFarBack bytes in front of buf are readable.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len)
+        {
+            // Step 1: rewind to the start of a sequence (any byte that is not of the form 10xxxxxx).
+            int rewind = 0;
+            bool atSequenceStart = false;
+            while (rewind <= howFarBack && !atSequenceStart)
+            {
+                atSequenceStart = (buf[-rewind] & 0xC0) != 0x80;
+                if (!atSequenceStart)
+                {
+                    rewind++;
+                }
+            }
+
+            // Only continuation bytes in the rewind window: report the oldest byte we looked at.
+            if (!atSequenceStart)
+            {
+                return buf - howFarBack;
+            }
+
+            byte* start = buf - rewind;
+            int total = len + rewind;
+            int offset = 0;
+
+            // Step 2: validate one sequence per iteration. On any failure we return the address of the
+            // byte at which that sequence starts (seq).
+            while (offset < total)
+            {
+                uint lead = start[offset];
+                if (lead < 0x80)
+                {
+                    // ASCII byte: nothing to validate.
+                    offset++;
+                    continue;
+                }
+
+                byte* seq = start + offset;
+                int seqLen;
+                uint scalar;
+
+                if ((lead & 0xE0) == 0xC0)
+                {
+                    // 110xxxxx 10xxxxxx
+                    seqLen = 2;
+                    if (offset + seqLen > total) return seq; // truncated
+                    if ((seq[1] & 0xC0) != 0x80) return seq; // missing continuation byte
+
+                    scalar = ((lead & 0x1F) << 6) | (uint)(seq[1] & 0x3F);
+                    if (scalar < 0x80 || scalar > 0x7FF) return seq; // overlong
+                }
+                else if ((lead & 0xF0) == 0xE0)
+                {
+                    // 1110xxxx 10xxxxxx 10xxxxxx
+                    seqLen = 3;
+                    if (offset + seqLen > total) return seq; // truncated
+                    if ((seq[1] & 0xC0) != 0x80 || (seq[2] & 0xC0) != 0x80) return seq; // missing continuation
+
+                    scalar = ((lead & 0x0F) << 12) |
+                             ((uint)(seq[1] & 0x3F) << 6) | (uint)(seq[2] & 0x3F);
+                    bool isSurrogate = scalar > 0xD7FF && scalar < 0xE000;
+                    if (scalar < 0x800 || scalar > 0xFFFF || isSurrogate) return seq; // overlong or surrogate
+                }
+                else if ((lead & 0xF8) == 0xF0)
+                {
+                    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                    seqLen = 4;
+                    if (offset + seqLen > total) return seq; // truncated
+                    if ((seq[1] & 0xC0) != 0x80 || (seq[2] & 0xC0) != 0x80 || (seq[3] & 0xC0) != 0x80) return seq;
+
+                    scalar = ((lead & 0x07) << 18) | ((uint)(seq[1] & 0x3F) << 12) |
+                             ((uint)(seq[2] & 0x3F) << 6) | (uint)(seq[3] & 0x3F);
+                    if (scalar <= 0xFFFF || scalar > 0x10FFFF) return seq; // overlong or beyond U+10FFFF
+                }
+                else
+                {
+                    // A continuation byte where a lead byte is expected, or a byte in 0xF8..0xFF.
+                    return seq;
+                }
+
+                offset += seqLen;
+            }
+
+            // Reached the end of the range without finding an error.
+            return start + total;
+        }
+
+        private const byte TOO_SHORT = 0x01;      // bit 0: lead byte followed by a non-continuation byte
+        private const byte TOO_LONG = 0x02;       // bit 1: ASCII byte followed by a continuation byte
+        private const byte OVERLONG_3 = 0x04;     // bit 2
+        private const byte TOO_LARGE = 0x08;      // bit 3
+        private const byte SURROGATE = 0x10;      // bit 4
+        private const byte OVERLONG_2 = 0x20;     // bit 5
+        private const byte TOO_LARGE_1000 = 0x40; // bit 6
+        private const byte OVERLONG_4 = 0x40;     // bit 6 (same bit as TOO_LARGE_1000)
+        private const byte TWO_CONTS = 0x80;      // bit 7: continuation byte followed by a continuation byte
+        private const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // 0x83
+
+        private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int n4, int contbytes)
+        {
+            // Reduced 3-byte / 2-byte sequence counts (the ASCII count and the total length cancel out).
+            int reducedN3 = -2 * n4 + 2 * contbytes;
+            int reducedN2 = n4 - 3 * contbytes;
+            int utf16Adjustment = -2 * n4 - 2 * reducedN3 - reducedN2;
+
+            return (utf16Adjustment, -n4);
+        }
+
+        // SSE path (16-byte blocks): scans the buffer [pInputBuffer, pInputBuffer + inputLength) and returns a
+        // pointer to the first invalid byte as reported by SimpleRewindAndValidateWithErrors, or
+        // pInputBuffer + inputLength when no error is found. The out adjustments are computed from the counted
+        // 4-byte lead bytes (n4) and continuation bytes (contbytes). Callers must pass inputLength > 128.
+        [CompExactlyDependsOn(typeof(Sse41))]
+        public static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength,
+            out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            var processedLength = 0;
+            Debug.Assert(inputLength > 128);
+            {
+                if (processedLength + 16 < inputLength)
+                {
+                    var prevInputBlock = Vector128<byte>.Zero;
+                    var maxValue = Vector128.Create(
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
+                    var prevIncomplete = Sse2.SubtractSaturate(prevInputBlock, maxValue);
+                    var shuf1 = Vector128.Create(
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+                    var shuf2 = Vector128.Create(
+                        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000);
+                    var shuf3 = Vector128.Create(
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+
+                    var thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
+                    var fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
+                    var v0f = Vector128.Create((byte)0x0F);
+                    var v80 = Vector128.Create((byte)0x80);
+                    /****
+                     * We want to count the number of 4-byte sequences, the number of
+                     * 3-byte sequences, and the number of 2-byte sequences.
+                     * We can do it indirectly. We know how many bytes in total
+                     * we have (length). Let us assume that the length covers
+                     * only complete sequences (we need to adjust otherwise).
+                     * We have that
+                     *   length = 4 * n4 + 3 * n3 + 2 * n2 + n1
+                     * where n1 is the number of 1-byte sequences (ASCII),
+                     * n2 is the number of 2-byte sequences, n3 is the number
+                     * of 3-byte sequences, and n4 is the number of 4-byte sequences.
+                     *
+                     * Let ncon be the number of continuation bytes, then we have
+                     *   length = n4 + n3 + n2 + ncon + n1
+                     *
+                     * We can solve for n2 and n3 in terms of the other variables:
+                     *   n3 = n1 - 2 * n4 + 2 * ncon - length
+                     *   n2 = -2 * n1 + n4 - 3 * ncon + 2 * length
+                     * Thus we only need to count the number of continuation bytes,
+                     * the number of ASCII bytes and the number of 4-byte sequences.
+                     * But we need even less because we compute
+                     *   utfadjust = -2 * n4 - 2 * n3 - n2
+                     * so n1 and length cancel out in the end. Thus we only need to compute
+                     *   n3' = -2 * n4 + 2 * ncon
+                     *   n2' = n4 - 3 * ncon
+                     * which is what CalculateN2N3FinalSIMDAdjustments computes from n4 and contbytes.
+                     */
+
+                    // contbytes and n4 are running totals over the non-ASCII blocks accepted by the loop below.
+                    var contbytes = 0; // number of continuation bytes in the block
+                    var n4 = 0; // number of 4-byte sequences that start in this block
+                    for (; processedLength + 16 <= inputLength; processedLength += 16)
+                    {
+                        var currentBlock = Vector128.Load(pInputBuffer + processedLength);
+                        var mask = Sse2.MoveMask(currentBlock);
+                        if (mask == 0)
+                        {
+                            // We have an ASCII block, no need to process it, but
+                            // we need to check if the previous block was incomplete
+                            // (prevIncomplete is non-zero when its last bytes start an unfinished sequence).
+                            if (!Sse41.TestZ(prevIncomplete, prevIncomplete))
+                            {
+                                var invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3,
+                                    pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                // So the input is valid up to invalidBytePointer
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4,
+                                        ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                                    CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = Vector128<byte>.Zero;
+
+                            // Often, we have a lot of ASCII characters in a row: skip them 64 bytes at a time.
+                            var localasciirun = 16;
+                            if (processedLength + localasciirun + 64 <= inputLength)
+                            {
+                                for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
+                                {
+                                    var block1 = Vector128.Load(pInputBuffer + processedLength + localasciirun);
+                                    var block2 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 16);
+                                    var block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 32);
+                                    var block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 48);
+
+                                    var or = Vector128.BitwiseOr(Vector128.BitwiseOr(block1, block2), Vector128.BitwiseOr(block3, block4));
+                                    if (Sse2.MoveMask(or) != 0) break;
+                                }
+
+                                processedLength += localasciirun - 16;
+                            }
+                        }
+                        else // Contains non-ASCII characters, we need to do non-trivial processing
+                        {
+                            // Classify byte pairs via nibble-indexed lookups (shuf1/shuf2/shuf3); SubtractSaturate below
+                            // leaves the high bit set only where prev2 >= 0xE0 or prev3 >= 0xF0 (a continuation is due).
+                            var prev1 = Ssse3.AlignRight(currentBlock, prevInputBlock, 16 - 1);
+                            var byte_1_high = Ssse3.Shuffle(shuf1,
+                                Sse2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
+                            var byte_1_low = Ssse3.Shuffle(shuf2, prev1 & v0f);
+                            var byte_2_high = Ssse3.Shuffle(shuf3,
+                                Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
+                            var sc = Vector128.BitwiseAnd(Vector128.BitwiseAnd(byte_1_high, byte_1_low), byte_2_high);
+                            var prev2 = Ssse3.AlignRight(currentBlock, prevInputBlock, 16 - 2);
+                            var prev3 = Ssse3.AlignRight(currentBlock, prevInputBlock, 16 - 3);
+                            prevInputBlock = currentBlock;
+
+                            var isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte);
+                            var isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte);
+                            var must23 = Vector128.BitwiseOr(isThirdByte, isFourthByte);
+                            var must23As80 = Vector128.BitwiseAnd(must23, v80);
+                            var error = Vector128.Xor(must23As80, sc);
+
+                            if (!Sse41.TestZ(error, error))
+                            {
+                                byte* invalidBytePointer;
+                                if (processedLength == 0)
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+                                else
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue);
+
+                            contbytes += (int)BitOperations.PopCount((uint)Sse2.MoveMask(byte_2_high));
+                            // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
+                            n4 += (int)BitOperations.PopCount(
+                                (uint)Sse2.MoveMask(Sse2.SubtractSaturate(currentBlock, fourthByte)));
+                        }
+                    }
+
+                    // We may still have an error: fewer than 16 bytes may remain, or the last block may end mid-sequence.
+                    var hasIncompete = !Sse41.TestZ(prevIncomplete, prevIncomplete);
+                    if (processedLength < inputLength || hasIncompete)
+                    {
+                        byte* invalidBytePointer;
+                        if (processedLength == 0 || !hasIncompete)
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength,
+                                inputLength - processedLength);
+                        else
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3,
+                                inputLength - processedLength + 3);
+                        if (invalidBytePointer != pInputBuffer + inputLength)
+                        {
+                            if (invalidBytePointer < pInputBuffer + processedLength)
+                                RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                            else
+                                AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                            return invalidBytePointer;
+                        }
+
+                        AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                    }
+
+                    (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                    return pInputBuffer + inputLength;
+                }
+            }
+
+            return GetPointerToFirstInvalidByteFallback(pInputBuffer + processedLength, inputLength - processedLength,
+                out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+        }
+
+        [CompExactlyDependsOn(typeof(Avx2))]
+        public static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,
+            out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            var processedLength = 0;
+            Debug.Assert(inputLength > 128);
+            {
+                if (processedLength + 32 < inputLength)
+                {
+                    // We still have work to do!
+                    var prevInputBlock = Vector256<byte>.Zero;
+                    var maxValue = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
+                    var prevIncomplete = Avx2.SubtractSaturate(prevInputBlock, maxValue);
+                    var shuf1 = Vector256.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+
+                    var shuf2 = Vector256.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000);
+                    var shuf3 = Vector256.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+
+                    var thirdByte = Vector256.Create((byte)(0b11100000u - 0x80));
+                    var fourthByte = Vector256.Create((byte)(0b11110000u - 0x80));
+                    var v0f = Vector256.Create((byte)0x0F);
+                    var v80 = Vector256.Create((byte)0x80);
+
+                    // The block goes from processedLength to processedLength/16*16.
+                    var contbytes = 0; // number of continuation bytes in the block
+                    var n4 = 0; // number of 4-byte sequences that start in this block
+                    for (; processedLength + 32 <= inputLength; processedLength += 32)
+                    {
+                        var currentBlock = Vector256.Load(pInputBuffer + processedLength);
+                        var mask = Avx2.MoveMask(currentBlock);
+                        if (mask == 0)
+                        {
+                            // We have an ASCII block, no need to process it, but
+                            // we need to check if the previous block was incomplete.
+                            if (!Avx.TestZ(prevIncomplete, prevIncomplete))
+                            {
+                                var invalidBytePointer = SimpleRewindAndValidateWithErrors(32 - 3,
+                                    pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                // So the code is correct up to invalidBytePointer
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4,
+                                        ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                                    CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = Vector256<byte>.Zero;
+
+                            // Often, we have a lot of ASCII characters in a row.
+                            var localasciirun = 32;
+                            if (processedLength + localasciirun + 64 <= inputLength)
+                            {
+                                for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
+                                {
+                                    var block1 = Vector256.Load(pInputBuffer + processedLength + localasciirun);
+                                    var block2 = Vector256.Load(pInputBuffer + processedLength + localasciirun + 32);
+                                    var or = Avx2.Or(block1, block2);
+                                    if (Avx2.MoveMask(or) != 0) break;
+                                }
+                                processedLength += localasciirun - 32;
+                            }
+                        }
+                        else // Contains non-ASCII characters, we need to do non-trivial processing
+                        {
+                            // Use SubtractSaturate to effectively compare if bytes in block are greater than markers.
+                            var shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21);
+                            prevInputBlock = currentBlock;
+                            var prev1 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 1);
+                            // Vector256.Shuffle vs Avx2.Shuffle
+                            // https://github.com/dotnet/runtime/blob/1400c1e7a888ea1e710e5c08d55c800e0b04bf8a/docs/coding-guidelines/vectorization-guidelines.md#vector256shuffle-vs-avx2shuffle
+                            var byte_1_high = Avx2.Shuffle(shuf1,
+                                Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() &
+                                v0f); // takes the XXXX 0000 part of the previous byte
+                            var byte_1_low =
+                                Avx2.Shuffle(shuf2, prev1 & v0f); // takes the 0000 XXXX part of the previous part
+                            var byte_2_high = Avx2.Shuffle(shuf3,
+                                Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() &
+                                v0f); // takes the XXXX 0000 part of the current byte
+                            var sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high);
+                            var prev2 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 2);
+                            var prev3 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 3);
+                            var isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte);
+                            var isFourthByte = Avx2.SubtractSaturate(prev3, fourthByte);
+                            var must23 = Avx2.Or(isThirdByte, isFourthByte);
+                            var must23As80 = Avx2.And(must23, v80);
+                            var error = Avx2.Xor(must23As80, sc);
+
+                            if (!Avx.TestZ(error, error))
+                            {
+                                byte* invalidBytePointer;
+                                if (processedLength == 0)
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+                                else
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);
+                            contbytes += (int)BitOperations.PopCount((uint)Avx2.MoveMask(byte_2_high));
+                            // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
+                            n4 += (int)BitOperations.PopCount(
+                                (uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)));
+                        }
+                    }
+
+                    // We may still have an error.
+                    var hasIncompete = !Avx.TestZ(prevIncomplete, prevIncomplete);
+                    if (processedLength < inputLength || hasIncompete)
+                    {
+                        byte* invalidBytePointer;
+                        if (processedLength == 0 || !hasIncompete)
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength,
+                                inputLength - processedLength);
+                        else
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3,
+                                inputLength - processedLength + 3);
+                        if (invalidBytePointer != pInputBuffer + inputLength)
+                        {
+                            if (invalidBytePointer < pInputBuffer + processedLength)
+                                RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                            else
+                                AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                                CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                            return invalidBytePointer;
+                        }
+
+                        AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                    }
+
+                    (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                        CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                    return pInputBuffer + inputLength;
+                }
+            }
+
+            return GetPointerToFirstInvalidByteFallback(pInputBuffer + processedLength, inputLength - processedLength,
+                out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+        }
+
+
+        [CompExactlyDependsOn(typeof(Avx512Vbmi))]
+        public static unsafe byte* GetPointerToFirstInvalidByteAvx512(byte* pInputBuffer, int inputLength,
+            out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            var processedLength = 0;
+            Debug.Assert(inputLength > 128);
+            {
+                if (processedLength + 64 < inputLength)
+                {
+                    var prevInputBlock = Vector512<byte>.Zero;
+                    var maxValue = Vector512.Create(
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
+                    var prevIncomplete = Avx512BW.SubtractSaturate(prevInputBlock, maxValue);
+                    var shuf1 = Vector512.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+
+                    var shuf2 = Vector512.Create(
+                        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000);
+                    var shuf3 = Vector512.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+
+                    var thirdByte = Vector512.Create((byte)(0b11100000u - 0x80));
+                    var fourthByte = Vector512.Create((byte)(0b11110000u - 0x80));
+                    var v0f = Vector512.Create((byte)0x0F);
+                    var v80 = Vector512.Create((byte)0x80);
+
+                    // The loop below consumes the input in 64-byte blocks; a shorter tail is validated after the loop.
+                    var contbytes = 0; // number of continuation bytes in the block
+                    var n4 = 0; // number of 4-byte sequences that start in this block
+                    for (; processedLength + 64 <= inputLength; processedLength += 64)
+                    {
+                        var currentBlock = Vector512.Load(pInputBuffer + processedLength);
+                        var mask = currentBlock.ExtractMostSignificantBits();
+                        if (mask == 0)
+                        {
+                            // We have an ASCII block, no need to process it, but
+                            // we need to check if the previous block was incomplete.
+                            if (Avx512BW.CompareGreaterThan(prevIncomplete, Vector512<byte>.Zero)
+                                    .ExtractMostSignificantBits() != 0)
+                            {
+                                var invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3,
+                                    pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                // So the code is correct up to invalidBytePointer
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4,
+                                        ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                                    CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = Vector512<byte>.Zero;
+
+                            // Often, we have a lot of ASCII characters in a row.
+                            var localasciirun = 64;
+                            if (processedLength + localasciirun + 64 <= inputLength)
+                            {
+                                for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
+                                {
+                                    var block = Vector512.Load(pInputBuffer + processedLength + localasciirun);
+                                    if (block.ExtractMostSignificantBits() != 0) break;
+                                }
+                                processedLength += localasciirun - 64;
+                            }
+                        }
+                        else // Contains non-ASCII characters, we need to do non-trivial processing
+                        {
+                            // 'shuffled' = last 16 bytes of the previous block followed by the first 48 bytes of the current block.
+                            var movemask = Vector512.Create(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+                            var shuffled = Avx512F
+                                .PermuteVar16x32x2(currentBlock.AsInt32(), movemask, prevInputBlock.AsInt32()).AsByte();
+                            prevInputBlock = currentBlock; // from here on prevInputBlock == currentBlock; the AlignRight calls below rely on it
+
+                            var prev1 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 1);
+                            var byte_1_high = Avx512BW.Shuffle(shuf1,
+                                Avx512BW.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() &
+                                v0f); // takes the XXXX 0000 part of the previous byte
+                            var byte_1_low =
+                                Avx512BW.Shuffle(shuf2, prev1 & v0f); // takes the 0000 XXXX part of the previous byte
+                            var byte_2_high = Avx512BW.Shuffle(shuf3,
+                                Avx512BW.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() &
+                                v0f); // takes the XXXX 0000 part of the current byte
+                            var sc = Avx512F.And(Avx512F.And(byte_1_high, byte_1_low), byte_2_high);
+                            var prev2 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 2);
+                            var prev3 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 3);
+                            var isThirdByte = Avx512BW.SubtractSaturate(prev2, thirdByte); // saturating subtract as a compare: MSB set iff prev2 >= 0xE0
+                            var isFourthByte = Avx512BW.SubtractSaturate(prev3, fourthByte); // MSB set iff prev3 >= 0xF0
+                            var must23 = Avx512F.Or(isThirdByte, isFourthByte);
+                            var must23As80 = Avx512F.And(must23, v80);
+                            var error = Avx512F.Xor(must23As80, sc);
+
+                            if (Avx512BW.CompareGreaterThan(error, Vector512<byte>.Zero).ExtractMostSignificantBits() != 0)
+                            {
+                                byte* invalidBytePointer;
+                                if (processedLength == 0)
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(0,
+                                        pInputBuffer + processedLength, inputLength - processedLength);
+                                else
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(3,
+                                        pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4,
+                                        ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                                    CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = Avx512BW.SubtractSaturate(currentBlock, maxValue); // non-zero iff byte 61 >= 0xF0, byte 62 >= 0xE0 or byte 63 >= 0xC0
+                            contbytes += BitOperations.PopCount(byte_2_high.ExtractMostSignificantBits());
+                            // We use two instructions (SubtractSaturate and ExtractMostSignificantBits) to update n4, with one arithmetic operation.
+                            n4 += BitOperations.PopCount(Avx512BW.SubtractSaturate(currentBlock, fourthByte)
+                                .ExtractMostSignificantBits());
+                        }
+                    }
+
+                    // We may still have an error.
+                    var hasIncompete = Avx512BW.CompareGreaterThan(prevIncomplete, Vector512<byte>.Zero)
+                        .ExtractMostSignificantBits() != 0;
+                    if (processedLength < inputLength || hasIncompete)
+                    {
+                        byte* invalidBytePointer;
+                        if (processedLength == 0 || !hasIncompete)
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength,
+                                inputLength - processedLength);
+                        else
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3,
+                                inputLength - processedLength + 3);
+                        if (invalidBytePointer != pInputBuffer + inputLength)
+                        {
+                            if (invalidBytePointer < pInputBuffer + processedLength)
+                                RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                            else
+                                AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                                CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                            return invalidBytePointer;
+                        }
+
+                        AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                    }
+
+                    (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                        CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                    return pInputBuffer + inputLength;
+                }
+            }
+
+            return GetPointerToFirstInvalidByteFallback(pInputBuffer + processedLength, inputLength - processedLength,
+                out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+        }
+
+        [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
+        public static unsafe byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength,
+            out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            var processedLength = 0;
+            Debug.Assert(inputLength > 128);
+            {
+                if (processedLength + 32 < inputLength)
+                {
+                    // We still have work to do!
+                    var prevInputBlock = Vector128<byte>.Zero;
+                    var maxValue = Vector128.Create(
+                        255, 255, 255, 255, 255, 255, 255, 255,
+                        255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
+                    var prevIncomplete = AdvSimd.SubtractSaturate(prevInputBlock, maxValue); // all zero here; recomputed from each non-ASCII block: non-zero iff byte 13 >= 0xF0, byte 14 >= 0xE0 or byte 15 >= 0xC0
+                    var shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+                        TOO_SHORT | OVERLONG_2,
+                        TOO_SHORT,
+                        TOO_SHORT | OVERLONG_3 | SURROGATE,
+                        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+                    var shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                        CARRY | OVERLONG_2,
+                        CARRY,
+                        CARRY,
+                        CARRY | TOO_LARGE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000,
+                        CARRY | TOO_LARGE | TOO_LARGE_1000);
+                    var shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+                        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+                    var thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
+                    var fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
+                    var v0f = Vector128.Create((byte)0x0F);
+                    var v80 = Vector128.Create((byte)0x80);
+                    // Performance note: we could process 64 bytes at a time for better speed in some cases.
+
+                    // The block goes from processedLength to processedLength/16*16.
+                    var contbytes = 0; // number of continuation bytes in the block
+                    var n4 = 0; // number of 4-byte sequences that start in this block
+                    for (; processedLength + 16 <= inputLength; processedLength += 16)
+                    {
+                        var currentBlock = Vector128.Load(pInputBuffer + processedLength);
+                        if (AdvSimd.Arm64.MaxAcross(AdvSimd.And(currentBlock, v80).AsUInt32()).ToScalar() == 0)
+                        // We could it with (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) but it is slower on some
+                        // hardware.
+                        {
+                            // We have an ASCII block, no need to process it, but
+                            // we need to check if the previous block was incomplete.
+                            if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
+                            {
+                                var invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3,
+                                    pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                // So the code is correct up to invalidBytePointer
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4,
+                                        ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
+                                    CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = Vector128<byte>.Zero;
+                            // Often, we have a lot of ASCII characters in a row: scan ahead 64 bytes (four vectors) at a time.
+                            var localasciirun = 16;
+                            if (processedLength + localasciirun + 64 <= inputLength)
+                            {
+                                for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
+                                {
+                                    var block1 = Vector128.Load(pInputBuffer + processedLength + localasciirun);
+                                    var block2 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 16);
+                                    var block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 32);
+                                    var block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 48);
+                                    var or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));
+                                    if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127) break;
+                                }
+
+                                processedLength += localasciirun - 16;
+                            }
+                        }
+                        else
+                        {
+                            // Contains non-ASCII characters, we need to do non-trivial processing
+                            var prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, 16 - 1);
+                            // Vector128.Shuffle vs AdvSimd.Arm64.VectorTableLookup: prefer the latter!!!
+                            var byte_1_high = AdvSimd.Arm64.VectorTableLookup(shuf1,
+                                AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
+                            var byte_1_low = AdvSimd.Arm64.VectorTableLookup(shuf2, prev1 & v0f);
+                            var byte_2_high = AdvSimd.Arm64.VectorTableLookup(shuf3,
+                                AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
+                            var sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high);
+                            var prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, 16 - 2);
+                            var prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, 16 - 3);
+                            prevInputBlock = currentBlock;
+                            var isThirdByte = AdvSimd.SubtractSaturate(prev2, thirdByte);
+                            var isFourthByte = AdvSimd.SubtractSaturate(prev3, fourthByte);
+                            var must23 = AdvSimd.Or(isThirdByte, isFourthByte);
+                            var must23As80 = AdvSimd.And(must23, v80);
+                            var error = AdvSimd.Xor(must23As80, sc);
+                            // AdvSimd.Arm64.MaxAcross(error) works, but it might be slower
+                            // than AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)) on some
+                            // hardware:
+                            if (AdvSimd.Arm64.MaxAcross(error.AsUInt32()).ToScalar() != 0)
+                            {
+                                byte* invalidBytePointer;
+                                if (processedLength == 0)
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+                                else
+                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                                if (invalidBytePointer < pInputBuffer + processedLength)
+                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                                else
+                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                                return invalidBytePointer;
+                            }
+
+                            prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
+                            var largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
+                            contbytes += -AdvSimd.Arm64
+                                .AddAcross(AdvSimd.CompareLessThanOrEqual(currentBlock.AsSByte(), largestcont)).ToScalar();
+
+                            // computing n4 is more expensive than we would like:
+                            var fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
+                            var largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne);
+                            var n4add = AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
+                            int negn4add = (byte)-n4add;
+                            n4 += negn4add;
+                        }
+                    }
+
+                    var hasIncompete = AdvSimd.Arm64.MaxAcross(prevIncomplete.AsUInt32()).ToScalar() != 0;
+                    if (processedLength < inputLength || hasIncompete)
+                    {
+                        byte* invalidBytePointer;
+                        if (processedLength == 0 || !hasIncompete)
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength,
+                                inputLength - processedLength);
+                        else
+                            invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3,
+                                inputLength - processedLength + 3);
+                        if (invalidBytePointer != pInputBuffer + inputLength)
+                        {
+                            if (invalidBytePointer < pInputBuffer + processedLength)
+                                RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                            else
+                                AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                            return invalidBytePointer;
+                        }
+
+                        AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                    }
+
+                    (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                    return pInputBuffer + inputLength;
+                }
+            }
+
+            return GetPointerToFirstInvalidByteFallback(pInputBuffer + processedLength, inputLength - processedLength,
+                out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void RemoveCounters(byte* start, byte* end, ref int n4, ref int contbytes)
+        {
+            for (var p = start; p < end; p++) // un-count every byte in [start, end)
+            {
+                if ((*p & 0b11000000) == 0b10000000) contbytes -= 1; // continuation byte (10xxxxxx)
+                if ((*p & 0b11110000) == 0b11110000) n4 -= 1; // byte >= 0xF0, counted as the lead of a 4-byte sequence
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void AddCounters(byte* start, byte* end, ref int n4, ref int contbytes)
+        {
+            for (var p = start; p < end; p++) // count every byte in [start, end)
+            {
+                if ((*p & 0b11000000) == 0b10000000) contbytes += 1; // continuation byte (10xxxxxx)
+                if ((*p & 0b11110000) == 0b11110000) n4 += 1; // byte >= 0xF0, counted as the lead of a 4-byte sequence
+            }
+        }
     }
 }

From 5d777c7fda8ab2b4e4e19a6922460a0900cf7ac3 Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Tue, 25 Jun 2024 23:52:12 +0200
Subject: [PATCH 2/3] apply patch from the upstream

---
 .../Text/Unicode/Utf8Utility.Validation.cs    | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index c1b0fc942788f..9f437657e455d 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -1640,6 +1640,8 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustmen
                     var fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
                     var v0f = Vector128.Create((byte)0x0F);
                     var v80 = Vector128.Create((byte)0x80);
+                    Vector128<byte> fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
+                    Vector128<sbyte> largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
                     // Performance note: we could process 64 bytes at a time for better speed in some cases.
 
                     // The block goes from processedLength to processedLength/16*16.
@@ -1648,13 +1650,13 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustmen
                     for (; processedLength + 16 <= inputLength; processedLength += 16)
                     {
                         var currentBlock = Vector128.Load(pInputBuffer + processedLength);
-                        if (AdvSimd.Arm64.MaxAcross(AdvSimd.And(currentBlock, v80).AsUInt32()).ToScalar() == 0)
+                        if ((currentBlock & v80) == Vector128<byte>.Zero)
                         // We could it with (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) but it is slower on some
                         // hardware.
                         {
                             // We have an ASCII block, no need to process it, but
                             // we need to check if the previous block was incomplete.
-                            if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
+                            if (prevIncomplete != Vector128<byte>.Zero)
                             {
                                 var invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3,
                                     pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
@@ -1681,7 +1683,7 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustmen
                                     var block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 32);
                                     var block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 48);
                                     var or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));
-                                    if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127) break;
+                                    if ((or & v80) != Vector128<byte>.Zero) break;
                                 }
 
                                 processedLength += localasciirun - 16;
@@ -1709,7 +1711,7 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustmen
                             // AdvSimd.Arm64.MaxAcross(error) works, but it might be slower
                             // than AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)) on some
                             // hardware:
-                            if (AdvSimd.Arm64.MaxAcross(error.AsUInt32()).ToScalar() != 0)
+                            if (error != Vector128<byte>.Zero)
                             {
                                 byte* invalidBytePointer;
                                 if (processedLength == 0)
@@ -1725,20 +1727,21 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustmen
                             }
 
                             prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
-                            var largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
                             contbytes += -AdvSimd.Arm64
                                 .AddAcross(AdvSimd.CompareLessThanOrEqual(currentBlock.AsSByte(), largestcont)).ToScalar();
 
                             // computing n4 is more expensive than we would like:
-                            var fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
                             var largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne);
-                            var n4add = AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
-                            int negn4add = (byte)-n4add;
-                            n4 += negn4add;
+                            if (largerthan0f != Vector128<byte>.Zero)
+                            {
+                                byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
+                                int negn4add = (int)(byte)-n4add;
+                                n4 += negn4add;
+                            }
                         }
                     }
 
-                    var hasIncompete = AdvSimd.Arm64.MaxAcross(prevIncomplete.AsUInt32()).ToScalar() != 0;
+                    var hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
                     if (processedLength < inputLength || hasIncompete)
                     {
                         byte* invalidBytePointer;

From eaf218d9f0e03bd47cb329f71a4fe870b5306fa9 Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Sat, 29 Jun 2024 14:52:42 +0200
Subject: [PATCH 3/3] sync with upstream

---
 .../Text/Unicode/Utf8Utility.Validation.cs    | 278 ++++++++++--------
 1 file changed, 161 insertions(+), 117 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
index 9f437657e455d..784c3ae7ef68d 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
@@ -1592,28 +1592,25 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustmen
         }
 
         [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
-        public static unsafe byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength,
-            out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        private static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
         {
-            var processedLength = 0;
-            Debug.Assert(inputLength > 128);
+            int processedLength = 0;
+            if (processedLength + 32 < inputLength)
             {
-                if (processedLength + 32 < inputLength)
-                {
-                    // We still have work to do!
-                    var prevInputBlock = Vector128<byte>.Zero;
-                    var maxValue = Vector128.Create(
+                // We still have work to do!
+                Vector128<byte> prevInputBlock = Vector128<byte>.Zero;
+                Vector128<byte> maxValue = Vector128.Create(
                         255, 255, 255, 255, 255, 255, 255, 255,
                         255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
-                    var prevIncomplete = AdvSimd.SubtractSaturate(prevInputBlock, maxValue);
-                    var shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+                Vector128<byte> prevIncomplete = AdvSimd.SubtractSaturate(prevInputBlock, maxValue);
+                Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                         TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                         TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
                         TOO_SHORT | OVERLONG_2,
                         TOO_SHORT,
                         TOO_SHORT | OVERLONG_3 | SURROGATE,
                         TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
-                    var shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+                Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
                         CARRY | OVERLONG_2,
                         CARRY,
                         CARRY,
@@ -1629,146 +1626,193 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustmen
                         CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
                         CARRY | TOO_LARGE | TOO_LARGE_1000,
                         CARRY | TOO_LARGE | TOO_LARGE_1000);
-                    var shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+                Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                         TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                         TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
                         TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
                         TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                         TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                         TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-                    var thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
-                    var fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
-                    var v0f = Vector128.Create((byte)0x0F);
-                    var v80 = Vector128.Create((byte)0x80);
-                    Vector128<byte> fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
-                    Vector128<sbyte> largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
-                    // Performance note: we could process 64 bytes at a time for better speed in some cases.
+                Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
+                Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
+                Vector128<byte> v0f = Vector128.Create((byte)0x0F);
+                Vector128<byte> v80 = Vector128.Create((byte)0x80);
+                Vector128<byte> fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
+                Vector128<sbyte> largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
+                // Performance note: we could process 64 bytes at a time for better speed in some cases.
+
+                // The block goes from processedLength to processedLength/16*16.
+                int contbytes = 0; // number of continuation bytes in the block
+                int n4 = 0; // number of 4-byte sequences that start in this block
+                /////
+                // Design:
+                // Instead of updating n4 and contbytes continuously, we accumulate
+                // the values in n4v and contv, while using overflowCounter to make
+                // sure we do not overflow. This allows you to reach good performance
+                // on systems where summing across vectors is slow.
+                ////
+                Vector128<sbyte> n4v = Vector128<sbyte>.Zero;
+                Vector128<sbyte> contv = Vector128<sbyte>.Zero;
+                int overflowCounter = 0;
+                for (; processedLength + 16 <= inputLength; processedLength += 16)
+                {
 
-                    // The block goes from processedLength to processedLength/16*16.
-                    var contbytes = 0; // number of continuation bytes in the block
-                    var n4 = 0; // number of 4-byte sequences that start in this block
-                    for (; processedLength + 16 <= inputLength; processedLength += 16)
+                    Vector128<byte> currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);
+                    if ((currentBlock & v80) == Vector128<byte>.Zero)
                     {
-                        var currentBlock = Vector128.Load(pInputBuffer + processedLength);
-                        if ((currentBlock & v80) == Vector128<byte>.Zero)
-                        // We could it with (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) but it is slower on some
-                        // hardware.
+                        // We have an ASCII block, no need to process it, but
+                        // we need to check if the previous block was incomplete.
+                        if (prevIncomplete != Vector128<byte>.Zero)
                         {
-                            // We have an ASCII block, no need to process it, but
-                            // we need to check if the previous block was incomplete.
-                            if (prevIncomplete != Vector128<byte>.Zero)
+                            contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                            if (n4v != Vector128<sbyte>.Zero)
                             {
-                                var invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3,
-                                    pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
-                                // So the code is correct up to invalidBytePointer
-                                if (invalidBytePointer < pInputBuffer + processedLength)
-                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4,
-                                        ref contbytes);
-                                else
-                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
-                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) =
-                                    CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
-                                return invalidBytePointer;
+                                n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
                             }
-
-                            prevIncomplete = Vector128<byte>.Zero;
-                            // Often, we have a lot of ASCII characters in a row.
-                            var localasciirun = 16;
-                            if (processedLength + localasciirun + 64 <= inputLength)
+                            byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                            // So the code is correct up to invalidBytePointer
+                            if (invalidBytePointer < pInputBuffer + processedLength)
                             {
-                                for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
-                                {
-                                    var block1 = Vector128.Load(pInputBuffer + processedLength + localasciirun);
-                                    var block2 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 16);
-                                    var block3 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 32);
-                                    var block4 = Vector128.Load(pInputBuffer + processedLength + localasciirun + 48);
-                                    var or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));
-                                    if ((or & v80) != Vector128<byte>.Zero) break;
-                                }
-
-                                processedLength += localasciirun - 16;
+                                RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
                             }
+                            else
+                            {
+                                AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                            }
+                            (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                            return invalidBytePointer;
                         }
-                        else
+                        prevIncomplete = Vector128<byte>.Zero;
+                        // Often, we have a lot of ASCII characters in a row.
+                        int localasciirun = 16;
+                        if (processedLength + localasciirun + 16 <= inputLength)
                         {
-                            // Contains non-ASCII characters, we need to do non-trivial processing
-                            var prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, 16 - 1);
-                            // Vector128.Shuffle vs AdvSimd.Arm64.VectorTableLookup: prefer the latter!!!
-                            var byte_1_high = AdvSimd.Arm64.VectorTableLookup(shuf1,
-                                AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
-                            var byte_1_low = AdvSimd.Arm64.VectorTableLookup(shuf2, prev1 & v0f);
-                            var byte_2_high = AdvSimd.Arm64.VectorTableLookup(shuf3,
-                                AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
-                            var sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high);
-                            var prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, 16 - 2);
-                            var prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, 16 - 3);
-                            prevInputBlock = currentBlock;
-                            var isThirdByte = AdvSimd.SubtractSaturate(prev2, thirdByte);
-                            var isFourthByte = AdvSimd.SubtractSaturate(prev3, fourthByte);
-                            var must23 = AdvSimd.Or(isThirdByte, isFourthByte);
-                            var must23As80 = AdvSimd.And(must23, v80);
-                            var error = AdvSimd.Xor(must23As80, sc);
-                            // AdvSimd.Arm64.MaxAcross(error) works, but it might be slower
-                            // than AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)) on some
-                            // hardware:
-                            if (error != Vector128<byte>.Zero)
+                            Vector128<byte> block = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);
+                            if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(AdvSimd.And(block, v80))).ToScalar() == 0)
                             {
-                                byte* invalidBytePointer;
-                                if (processedLength == 0)
-                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
-                                else
-                                    invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
-                                if (invalidBytePointer < pInputBuffer + processedLength)
-                                    RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
-                                else
-                                    AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
-                                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
-                                return invalidBytePointer;
-                            }
+                                localasciirun += 16;
+                                for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
+                                {
+                                    Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);
+                                    Vector128<byte> block2 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 16);
+                                    Vector128<byte> block3 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 32);
+                                    Vector128<byte> block4 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 48);
+                                    Vector128<byte> or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));
 
-                            prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
-                            contbytes += -AdvSimd.Arm64
-                                .AddAcross(AdvSimd.CompareLessThanOrEqual(currentBlock.AsSByte(), largestcont)).ToScalar();
+                                    if ((or & v80) != Vector128<byte>.Zero)
+                                    {
+                                        break;
+                                    }
+                                }
 
-                            // computing n4 is more expensive than we would like:
-                            var largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne);
-                            if (largerthan0f != Vector128<byte>.Zero)
-                            {
-                                byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
-                                int negn4add = (int)(byte)-n4add;
-                                n4 += negn4add;
                             }
+
+                            processedLength += localasciirun - 16;
                         }
                     }
-
-                    var hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
-                    if (processedLength < inputLength || hasIncompete)
+                    else
                     {
-                        byte* invalidBytePointer;
-                        if (processedLength == 0 || !hasIncompete)
-                            invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength,
-                                inputLength - processedLength);
-                        else
-                            invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3,
-                                inputLength - processedLength + 3);
-                        if (invalidBytePointer != pInputBuffer + inputLength)
+                        // Contains non-ASCII characters, we need to do non-trivial processing
+                        Vector128<byte> prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1));
+                        // Vector128.Shuffle vs AdvSimd.Arm64.VectorTableLookup: prefer the latter!!!
+                        Vector128<byte> byte_1_high = AdvSimd.Arm64.VectorTableLookup(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
+                        Vector128<byte> byte_1_low = AdvSimd.Arm64.VectorTableLookup(shuf2, (prev1 & v0f));
+                        Vector128<byte> byte_2_high = AdvSimd.Arm64.VectorTableLookup(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
+                        Vector128<byte> sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high);
+                        Vector128<byte> prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2));
+                        Vector128<byte> prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3));
+                        prevInputBlock = currentBlock;
+                        Vector128<byte> isThirdByte = AdvSimd.SubtractSaturate(prev2, thirdByte);
+                        Vector128<byte> isFourthByte = AdvSimd.SubtractSaturate(prev3, fourthByte);
+                        Vector128<byte> must23 = AdvSimd.Or(isThirdByte, isFourthByte);
+                        Vector128<byte> must23As80 = AdvSimd.And(must23, v80);
+                        Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
+                        if (error != Vector128<byte>.Zero)
                         {
+                            contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                            if (n4v != Vector128<sbyte>.Zero)
+                            {
+                                n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                            }
+                            byte* invalidBytePointer;
+                            if (processedLength == 0)
+                            {
+                                invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+                            }
+                            else
+                            {
+                                invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                            }
                             if (invalidBytePointer < pInputBuffer + processedLength)
+                            {
                                 RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                            }
                             else
+                            {
                                 AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                            }
                             (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
                             return invalidBytePointer;
                         }
+                        prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
+                        contv += AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont);
+                        n4v += AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
+                        overflowCounter++;
+                        // We have a risk of overflow if overflowCounter reaches 255,
+                        // in which case, we empty contv and n4v, and update contbytes and
+                        // n4.
+                        if (overflowCounter == 0xff)
+                        {
+                            overflowCounter = 0;
+                            contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                            contv = Vector128<sbyte>.Zero;
+                            if (n4v != Vector128<sbyte>.Zero)
+                            {
+                                n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                                n4v = Vector128<sbyte>.Zero;
+                            }
+                        }
+                    }
+                }
+                contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                if (n4v != Vector128<sbyte>.Zero)
+                {
+                    n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                }
 
+                bool hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
+                if (processedLength < inputLength || hasIncompete)
+                {
+                    byte* invalidBytePointer;
+                    if (processedLength == 0 || !hasIncompete)
+                    {
+                        invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+                    }
+                    else
+                    {
+                        invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+                    }
+                    if (invalidBytePointer != pInputBuffer + inputLength)
+                    {
+                        if (invalidBytePointer < pInputBuffer + processedLength)
+                        {
+                            RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref contbytes);
+                        }
+                        else
+                        {
+                            AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
+                        }
+                        (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                        return invalidBytePointer;
+                    }
+                    else
+                    {
                         AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref contbytes);
                     }
-
-                    (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
-                    return pInputBuffer + inputLength;
                 }
+                (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(n4, contbytes);
+                return pInputBuffer + inputLength;
             }
-
             return GetPointerToFirstInvalidByteFallback(pInputBuffer + processedLength, inputLength - processedLength,
                 out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
         }