|
| 1 | +// Licensed to the .NET Foundation under one or more agreements. |
| 2 | +// The .NET Foundation licenses this file to you under the MIT license. |
| 3 | +// See the LICENSE file in the project root for more information. |
| 4 | + |
| 5 | +using System.Buffers; |
| 6 | +using System.Globalization; |
| 7 | +using System.Linq; |
| 8 | +using System.Reflection; |
| 9 | +using System.Runtime.InteropServices; |
| 10 | +using Xunit; |
| 11 | + |
| 12 | +namespace System.Text.Unicode.Tests |
| 13 | +{ |
| 14 | + public partial class Utf16UtilityTests |
| 15 | + { |
// Delegate shape used to invoke the "GetPointerToFirstInvalidChar" method that is located via
// reflection in CreateGetPointerToFirstInvalidCharFn. It returns a pointer into the input buffer
// and reports two adjustment values through its 'out' parameters.
private unsafe delegate char* GetPointerToFirstInvalidCharDel(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment);
// Lazily created delegate instance shared by all tests; the reflection lookup runs when
// .Value is first read rather than when the type is initialized.
private static readonly Lazy<GetPointerToFirstInvalidCharDel> _getPointerToFirstInvalidCharFn = CreateGetPointerToFirstInvalidCharFn();
| 18 | + |
// Short, well-formed inputs. Each row is: placeholder-encoded input, expected scalar (rune)
// count, expected UTF-8 byte count. Placeholders such as <EACU> are expanded by ProcessInput.
[Theory]
[InlineData("", 0, 0)] // an empty buffer is trivially valid
[InlineData("X", 1, 1)]
[InlineData("XY", 2, 2)]
[InlineData("XYZ", 3, 3)]
[InlineData("<EACU>", 1, 2)]
[InlineData("X<EACU>", 2, 3)]
[InlineData("<EACU>X", 2, 3)]
[InlineData("<EURO>", 1, 3)]
[InlineData("<GRIN>", 1, 4)]
[InlineData("X<GRIN>Z", 3, 6)]
[InlineData("X<0000>Z", 3, 3)] // embedded null chars are ordinary valid chars
public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallValidBuffers(string unprocessedInput, int expectedRuneCount, int expectedUtf8ByteCount)
    // An expected index of -1 means "no invalid char anywhere in the input".
    => GetIndexOfFirstInvalidUtf16Sequence_Test_Core(
        unprocessedInput,
        expectedIdxOfFirstInvalidChar: -1,
        expectedRuneCount: expectedRuneCount,
        expectedUtf8ByteCount: expectedUtf8ByteCount);
| 35 | + |
// Short inputs containing an unpaired surrogate. Each row is: placeholder-encoded input, index of
// the first invalid char, then the scalar (rune) count and UTF-8 byte count of the valid prefix.
[Theory]
[InlineData("<DC00>", 0, 0, 0)] // lone low surrogate as the very first char
[InlineData("X<DC00>", 1, 1, 1)] // lone low surrogate after valid ASCII
[InlineData("<EURO><DC00>", 1, 1, 3)] // lone low surrogate after valid non-ASCII
[InlineData("<D800>", 0, 0, 0)] // lone high surrogate with nothing after it
[InlineData("<D800>Y", 0, 0, 0)] // lone high surrogate followed by an ASCII char
[InlineData("<D800><D800>", 0, 0, 0)] // lone high surrogate followed by another high surrogate
[InlineData("<D800><EURO>", 0, 0, 0)] // lone high surrogate followed by a valid non-ASCII char
[InlineData("<DC00><DC00>", 0, 0, 0)] // low surrogate with no preceding high surrogate
[InlineData("<DC00><D800>", 0, 0, 0)] // low surrogate with no preceding high surrogate
[InlineData("<GRIN><DC00><DC00>", 2, 1, 4)] // lone low surrogate after a valid surrogate pair
[InlineData("<GRIN><DC00><D800>", 2, 1, 4)] // lone low surrogate after a valid surrogate pair
[InlineData("<GRIN><0000><DC00><D800>", 3, 2, 5)] // lone low surrogate after a valid null char
public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallInvalidBuffers(string unprocessedInput, int idxOfFirstInvalidChar, int expectedRuneCount, int expectedUtf8ByteCount)
    => GetIndexOfFirstInvalidUtf16Sequence_Test_Core(
        unprocessedInput,
        expectedIdxOfFirstInvalidChar: idxOfFirstInvalidChar,
        expectedRuneCount: expectedRuneCount,
        expectedUtf8ByteCount: expectedUtf8ByteCount);
| 53 | + |
// Walks a single 128-char buffer through a series of in-place mutations (valid pairs, lone
// surrogates at the end, and surrogates around index 15/16) and re-validates after each step.
// Every step builds on the buffer state left behind by the previous one.
[Fact]
public void GetIndexOfFirstInvalidUtf16Sequence_WithInvalidSurrogateSequences()
{
    char[] buffer = Enumerable.Repeat('x', 128).ToArray();

    // Validates the current buffer contents; firstInvalidIdx == -1 means "fully valid".
    void Verify(int firstInvalidIdx, int runeCount, long utf8ByteCount)
        => GetIndexOfFirstInvalidUtf16Sequence_Test_Core(buffer, firstInvalidIdx, runeCount, utf8ByteCount);

    // Step 1: pure ASCII.
    Verify(firstInvalidIdx: -1, runeCount: 128, utf8ByteCount: 128);

    // Step 2: a valid surrogate pair at the very start.
    (buffer[0], buffer[1]) = ('\uD800', '\uDFFF');
    Verify(firstInvalidIdx: -1, runeCount: 127, utf8ByteCount: 130);

    // Step 3: another valid surrogate pair close to the end.
    (buffer[124], buffer[125]) = ('\uD800', '\uDFFF');
    Verify(firstInvalidIdx: -1, runeCount: 126, utf8ByteCount: 132);

    // Step 4: a lone surrogate in the *final* slot, first a high one, then a low one.
    buffer[127] = '\uD800';
    Verify(firstInvalidIdx: 127, runeCount: 125, utf8ByteCount: 131);

    buffer[127] = '\uDFFF';
    Verify(firstInvalidIdx: 127, runeCount: 125, utf8ByteCount: 131);

    // Step 5: give that trailing low surrogate a matching high surrogate.
    buffer[126] = '\uD800';
    Verify(firstInvalidIdx: -1, runeCount: 125, utf8ByteCount: 134);

    // Step 6: an ill-formed high/high/low/low run in the middle (straddles a vector boundary).
    buffer[12] = '\u0080'; // encodes as 2 UTF-8 bytes
    (buffer[13], buffer[14]) = ('\uD800', '\uD800'); // high, high
    (buffer[15], buffer[16]) = ('\uDFFF', '\uDFFF'); // low, low
    Verify(firstInvalidIdx: 13, runeCount: 12, utf8ByteCount: 16);

    // Step 7: repair the run so it becomes two well-formed pairs (13/14 and 15/16).
    buffer[14] = '\uDC00'; // low surrogate
    buffer[15] = '\uDBFF'; // high surrogate
    Verify(firstInvalidIdx: -1, runeCount: 123, utf8ByteCount: 139);

    // Step 8: break the pair that is split across the vector boundary; buffer[15] is still
    // a high surrogate, but it is now followed by ASCII.
    buffer[16] = 'x';
    Verify(firstInvalidIdx: 15, runeCount: 13, utf8ByteCount: 20);
}
| 107 | + |
// Expands the placeholders in 'unprocessedInput' and validates the result five times: once as-is,
// then behind a 128-char ASCII prefix whose leading chars are progressively replaced with 2-byte,
// 3-byte and surrogate-pair sequences. The expected values passed in describe the raw input only;
// the adjustments for the prefix are applied here.
private static void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(string unprocessedInput, int expectedIdxOfFirstInvalidChar, int expectedRuneCount, long expectedUtf8ByteCount)
{
    const int AsciiPrefixLength = 128;

    // Pass 1: the decoded input on its own.

    char[] decoded = ProcessInput(unprocessedInput).ToCharArray();
    GetIndexOfFirstInvalidUtf16Sequence_Test_Core(decoded, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);

    // Pass 2: the same data behind a run of ASCII chars (to test the call to ASCIIUtility at
    // method entry). Each prefix char contributes one scalar and one UTF-8 byte, and a
    // non-negative "first invalid" index moves along with the data.

    char[] buffer = Enumerable.Repeat('x', AsciiPrefixLength).Concat(decoded).ToArray();

    int firstInvalidIdx = (expectedIdxOfFirstInvalidChar < 0)
        ? expectedIdxOfFirstInvalidChar
        : expectedIdxOfFirstInvalidChar + AsciiPrefixLength;
    int runeCount = expectedRuneCount + AsciiPrefixLength;
    long utf8ByteCount = expectedUtf8ByteCount + AsciiPrefixLength;

    GetIndexOfFirstInvalidUtf16Sequence_Test_Core(buffer, firstInvalidIdx, runeCount, utf8ByteCount);

    // Pass 3: chars [0..7] alternate between a 2-byte (U+0080) and a 3-byte (U+0800) UTF-8
    // sequence so the vectorized code paths see mixed widths. Eight ASCII bytes become
    // 4 * (2 + 3) = 20 bytes, i.e. 12 more than before.

    for (int i = 0; i < 8; i++)
    {
        buffer[i] = (i % 2 == 0) ? '\u0080' : '\u0800';
    }

    utf8ByteCount += 12;

    GetIndexOfFirstInvalidUtf16Sequence_Test_Core(buffer, firstInvalidIdx, runeCount, utf8ByteCount);

    // Pass 4: continue the alternation through [8..12], then add a surrogate pair and one
    // ASCII char, so the vectorized code paths also see surrogates.

    for (int i = 8; i < 13; i++)
    {
        buffer[i] = (i % 2 == 0) ? '\u0080' : '\u0800';
    }

    buffer[13] = '\uD800'; // high surrogate
    buffer[14] = '\uDC00'; // low surrogate
    buffer[15] = 'z'; // ASCII char

    runeCount--; // two chars now form a single scalar
    utf8ByteCount += 9;

    GetIndexOfFirstInvalidUtf16Sequence_Test_Core(buffer, firstInvalidIdx, runeCount, utf8ByteCount);

    // Pass 5: a second pair split across the vector boundary at [15]/[16], to make sure it
    // is not mistaken for a standalone surrogate.

    buffer[15] = '\uDBFF'; // high surrogate
    buffer[16] = '\uDFFF'; // low surrogate

    runeCount--;
    utf8ByteCount += 2;

    GetIndexOfFirstInvalidUtf16Sequence_Test_Core(buffer, firstInvalidIdx, runeCount, utf8ByteCount);
}
| 173 | + |
// Runs the method under test over 'input' and checks its three results.
//   input                  - the UTF-16 code units to validate.
//   expectedRetVal         - index of the first invalid char, or -1 if the whole buffer is valid.
//   expectedRuneCount      - number of scalar values in the valid prefix.
//   expectedUtf8ByteCount  - number of UTF-8 code units needed to encode the valid prefix.
private static unsafe void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(char[] input, int expectedRetVal, int expectedRuneCount, long expectedUtf8ByteCount)
{
    // Arrange

    // Copy the data into a BoundedMemory instance and make it read-only before handing it to
    // the method under test. NOTE(review): presumably this surfaces out-of-bounds accesses and
    // stray writes as faults - confirm against the BoundedMemory implementation.
    using BoundedMemory<char> boundedMemory = BoundedMemory.AllocateFromExistingData(input);
    boundedMemory.MakeReadonly();

    // Act

    int actualRetVal;
    long actualUtf8CodeUnitCount;
    int actualRuneCount;

    fixed (char* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span))
    {
        char* pFirstInvalidChar = _getPointerToFirstInvalidCharFn.Value(pInputBuffer, input.Length, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment);

        // The unsigned comparison rejects both a negative difference (which wraps to a huge
        // value) and a pointer past the end of the buffer in a single check.
        long ptrDiff = pFirstInvalidChar - pInputBuffer;
        Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range.");

        Assert.True(utf8CodeUnitCountAdjustment >= 0, "UTF-8 code unit count adjustment must be non-negative.");
        Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");

        // A returned pointer equal to the end of the buffer means no invalid char was found.
        actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff;

        // The last two 'out' parameters are:
        // a) The number to be added to the "chars processed" return value to come up with the total UTF-8 code unit count, and
        // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.

        actualUtf8CodeUnitCount = ptrDiff + utf8CodeUnitCountAdjustment;
        actualRuneCount = (int)ptrDiff + scalarCountAdjustment;
    }

    // Assert

    // xUnit's Assert.Equal takes (expected, actual); keeping that order makes failure
    // messages label the two values correctly.
    Assert.Equal(expectedRetVal, actualRetVal);
    Assert.Equal(expectedRuneCount, actualRuneCount);
    Assert.Equal(expectedUtf8ByteCount, actualUtf8CodeUnitCount);
}
| 213 | + |
// Builds the lazily-initialized delegate used by the tests to call
// Utf16Utility.GetPointerToFirstInvalidChar. The type is looked up by name in the assembly that
// contains Utf8, and the method is looked up among its static members (public or non-public).
// The lookup is deferred until the returned Lazy's Value is first read; it throws if either the
// type or the method cannot be found.
private static Lazy<GetPointerToFirstInvalidCharDel> CreateGetPointerToFirstInvalidCharFn()
{
    return new Lazy<GetPointerToFirstInvalidCharDel>(() =>
    {
        Type utf16UtilityType = typeof(Utf8).Assembly.GetType("System.Text.Unicode.Utf16Utility");

        if (utf16UtilityType is null)
        {
            throw new Exception("Couldn't find Utf16Utility type in System.Private.CoreLib.");
        }

        MethodInfo methodInfo = utf16UtilityType.GetMethod("GetPointerToFirstInvalidChar", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);

        if (methodInfo is null)
        {
            // The message names the type that was actually searched (Utf16Utility).
            throw new Exception("Couldn't find GetPointerToFirstInvalidChar method on Utf16Utility.");
        }

        return (GetPointerToFirstInvalidCharDel)methodInfo.CreateDelegate(typeof(GetPointerToFirstInvalidCharDel));
    });
}
| 235 | + |
// Expands the placeholder syntax used by the inline test data into real UTF-16 text.
//   <EACU>, <EURO>, <GRIN>  - named shortcuts for U+00E9, U+20AC and U+1F600.
//   <ABCD>                  - the single UTF-16 code unit \uABCD (exactly four hex digits).
// Returns the expanded string. Throws FormatException if a '<' does not start a well-formed
// six-character placeholder.
private static string ProcessInput(string input)
{
    input = input.Replace("<EACU>", "\u00E9", StringComparison.Ordinal); // U+00E9 LATIN SMALL LETTER E WITH ACUTE
    input = input.Replace("<EURO>", "\u20AC", StringComparison.Ordinal); // U+20AC EURO SIGN
    input = input.Replace("<GRIN>", "\U0001F600", StringComparison.Ordinal); // U+1F600 GRINNING FACE

    // Replace <ABCD> with \uABCD. This allows us to flow potentially malformed
    // UTF-16 strings through Xunit. (The unit testing framework gets angry when
    // we try putting invalid UTF-16 data as inline test data.)

    const int PlaceholderLength = 6; // '<' + four hex digits + '>'

    int markerIdx = input.IndexOf('<');
    if (markerIdx < 0)
    {
        return input; // nothing left to expand
    }

    StringBuilder expanded = new StringBuilder(input.Length);
    int copiedUpTo = 0;

    while (markerIdx >= 0)
    {
        if (input.Length - markerIdx < PlaceholderLength || input[markerIdx + PlaceholderLength - 1] != '>')
        {
            throw new FormatException($"Malformed placeholder at index {markerIdx}; expected '<XXXX>' with exactly four hex digits.");
        }

        // Copy the literal text that precedes the placeholder, then the decoded code unit.
        expanded.Append(input, copiedUpTo, markerIdx - copiedUpTo);
        expanded.Append((char)ushort.Parse(input.Substring(markerIdx + 1, 4), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture));

        // Resume searching *after* the placeholder so that a decoded '<' (from <003C>) is
        // treated as data and never re-read as the start of another placeholder.
        copiedUpTo = markerIdx + PlaceholderLength;
        markerIdx = input.IndexOf('<', copiedUpTo);
    }

    expanded.Append(input, copiedUpTo, input.Length - copiedUpTo);
    return expanded.ToString();
}
| 254 | + } |
| 255 | +} |
0 commit comments