diff --git a/eng/Version.Details.xml b/eng/Version.Details.xml index 4c813db8f5bb..a4afd3185766 100644 --- a/eng/Version.Details.xml +++ b/eng/Version.Details.xml @@ -1,16 +1,16 @@ - + https://github.com/dotnet/coreclr - 45e04dd1bb1c7171d88a24454cb2c2811f46ce55 + d5865236e7898b730de28a7a6f034e975bb7282e - + https://github.com/dotnet/coreclr - 45e04dd1bb1c7171d88a24454cb2c2811f46ce55 + d5865236e7898b730de28a7a6f034e975bb7282e - + https://github.com/dotnet/coreclr - 45e04dd1bb1c7171d88a24454cb2c2811f46ce55 + d5865236e7898b730de28a7a6f034e975bb7282e diff --git a/eng/Versions.props b/eng/Versions.props index 2c9b2bf77e10..98c099f74694 100644 --- a/eng/Versions.props +++ b/eng/Versions.props @@ -41,8 +41,8 @@ 3.0.0-preview5-27610-11 3.0.0-preview5-27610-11 - 3.0.0-preview5-27610-72 - 3.0.0-preview5-27610-72 + 3.0.0-preview5-27612-73 + 3.0.0-preview5-27612-73 3.0.0-preview5.19211.2 diff --git a/global.json b/global.json index 0450dcb6d21e..60c63e24e559 100644 --- a/global.json +++ b/global.json @@ -5,6 +5,6 @@ "msbuild-sdks": { "Microsoft.DotNet.Arcade.Sdk": "1.0.0-beta.19212.2", "Microsoft.DotNet.Helix.Sdk": "2.0.0-beta.19212.2", - "Microsoft.NET.Sdk.IL": "3.0.0-preview5-27610-72" + "Microsoft.NET.Sdk.IL": "3.0.0-preview5-27612-73" } } diff --git a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs index 6665314d7317..4c20e9d6c66b 100644 --- a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs +++ b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs @@ -492,7 +492,16 @@ public void GetOrAddDocumentName2() Assert.Equal(@"a/", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n6)))); Assert.Equal(@"/", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n7)))); Assert.Equal(@"\\", 
mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n8)))); - Assert.Equal("\uFFFd\uFFFd", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9)))); + if (PlatformDetection.IsNetCore) + { + Assert.Equal("\uFFFD\uFFFD\uFFFD", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9)))); + } + else + { + // Versions of .NET prior to Core 3.0 didn't follow Unicode recommendations for U+FFFD substitution, + // so they sometimes emitted too few replacement chars. + Assert.Equal("\uFFFD\uFFFD", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9)))); + } Assert.Equal("\0", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n10)))); } } diff --git a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs index b6c05bce30ee..09220310e483 100644 --- a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs +++ b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs @@ -377,11 +377,24 @@ public void MetadataVersion() 0x08, 0x00, 0x00, 0x00, // padded version: + // [ E1 88 B4 ] -> U+1234 + // [ ED ] -> invalid (ED cannot be followed by A0) -> U+FFFD + // [ A0 ] -> invalid (not ASCII, not valid leading byte) -> U+FFFD + // [ 80 ] -> invalid (not ASCII, not valid leading byte) -> U+FFFD 0xE1, 0x88, 0xB4, 0xED, 0xA0, 0x80, 0x00, 0x00, }, builder.Slice(12, -132)); // the default decoder replaces bad byte sequences by U+FFFD - Assert.Equal("\u1234\ufffd\ufffd", ReadVersion(builder)); + if (PlatformDetection.IsNetCore) + { + Assert.Equal("\u1234\ufffd\ufffd\ufffd", ReadVersion(builder)); + } + else + { + // Versions of .NET prior to Core 3.0 didn't follow Unicode recommendations for U+FFFD substitution, + // so they sometimes emitted too few 
replacement chars. + Assert.Equal("\u1234\ufffd\ufffd", ReadVersion(builder)); + } } } } diff --git a/src/System.Runtime/tests/System.Runtime.Tests.csproj b/src/System.Runtime/tests/System.Runtime.Tests.csproj index 4b97b787c420..812406b043d9 100644 --- a/src/System.Runtime/tests/System.Runtime.Tests.csproj +++ b/src/System.Runtime/tests/System.Runtime.Tests.csproj @@ -287,9 +287,11 @@ + + @@ -338,4 +340,4 @@ - + \ No newline at end of file diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs new file mode 100644 index 000000000000..fd87b575b18b --- /dev/null +++ b/src/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs @@ -0,0 +1,255 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Buffers; +using System.Globalization; +using System.Linq; +using System.Reflection; +using System.Runtime.InteropServices; +using Xunit; + +namespace System.Text.Unicode.Tests +{ + public partial class Utf16UtilityTests + { + private unsafe delegate char* GetPointerToFirstInvalidCharDel(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment); + private static readonly Lazy _getPointerToFirstInvalidCharFn = CreateGetPointerToFirstInvalidCharFn(); + + [Theory] + [InlineData("", 0, 0)] // empty string is OK + [InlineData("X", 1, 1)] + [InlineData("XY", 2, 2)] + [InlineData("XYZ", 3, 3)] + [InlineData("", 1, 2)] + [InlineData("X", 2, 3)] + [InlineData("X", 2, 3)] + [InlineData("", 1, 3)] + [InlineData("", 1, 4)] + [InlineData("XZ", 3, 6)] + [InlineData("X<0000>Z", 3, 3)] // null chars are allowed + public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallValidBuffers(string unprocessedInput, int expectedRuneCount, int expectedUtf8ByteCount) + { + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, -1 /* expectedIdxOfFirstInvalidChar */, expectedRuneCount, expectedUtf8ByteCount); + } + + [Theory] + [InlineData("", 0, 0, 0)] // standalone low surrogate (at beginning of sequence) + [InlineData("X", 1, 1, 1)] // standalone low surrogate (preceded by valid ASCII data) + [InlineData("", 1, 1, 3)] // standalone low surrogate (preceded by valid non-ASCII data) + [InlineData("", 0, 0, 0)] // standalone high surrogate (missing follow-up low surrogate) + [InlineData("Y", 0, 0, 0)] // standalone high surrogate (followed by ASCII char) + [InlineData("", 0, 0, 0)] // standalone high surrogate (followed by high surrogate) + [InlineData("", 0, 0, 0)] // standalone high surrogate (followed by valid non-ASCII char) + [InlineData("", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate) + [InlineData("", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate) + 
[InlineData("", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair) + [InlineData("", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair) + [InlineData("<0000>", 3, 2, 5)] // standalone low surrogate (preceded by a valid null char) + public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallInvalidBuffers(string unprocessedInput, int idxOfFirstInvalidChar, int expectedRuneCount, int expectedUtf8ByteCount) + { + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, idxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount); + } + + [Fact] + public void GetIndexOfFirstInvalidUtf16Sequence_WithInvalidSurrogateSequences() + { + // All ASCII + + char[] chars = Enumerable.Repeat('x', 128).ToArray(); + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 128, expectedUtf8ByteCount: 128); + + // Throw a surrogate pair at the beginning + + chars[0] = '\uD800'; + chars[1] = '\uDFFF'; + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 127, expectedUtf8ByteCount: 130); + + // Throw a surrogate pair near the end + + chars[124] = '\uD800'; + chars[125] = '\uDFFF'; + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 126, expectedUtf8ByteCount: 132); + + // Throw a standalone surrogate code point at the *very* end + + chars[127] = '\uD800'; // high surrogate + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 127, expectedRuneCount: 125, expectedUtf8ByteCount: 131); + + chars[127] = '\uDFFF'; // low surrogate + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 127, expectedRuneCount: 125, expectedUtf8ByteCount: 131); + + // Make the final surrogate pair valid + + chars[126] = '\uD800'; // high surrogate + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 125, expectedUtf8ByteCount: 134); + + // Throw an invalid surrogate sequence in the middle (straddles a vector boundary) + + chars[12] = '\u0080'; // 2-byte UTF-8 
sequence + chars[13] = '\uD800'; // high surrogate + chars[14] = '\uD800'; // high surrogate + chars[15] = '\uDFFF'; // low surrogate + chars[16] = '\uDFFF'; // low surrogate + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 13, expectedRuneCount: 12, expectedUtf8ByteCount: 16); + + // Correct the surrogate sequence we just added + + chars[14] = '\uDC00'; // low surrogate + chars[15] = '\uDBFF'; // high surrogate + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 123, expectedUtf8ByteCount: 139); + + // Corrupt the surrogate pair that's split across a vector boundary + + chars[16] = 'x'; // ASCII char (remember.. chars[15] is a high surrogate char) + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 15, expectedRuneCount: 13, expectedUtf8ByteCount: 20); + } + + private static void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(string unprocessedInput, int expectedIdxOfFirstInvalidChar, int expectedRuneCount, long expectedUtf8ByteCount) + { + char[] processedInput = ProcessInput(unprocessedInput).ToCharArray(); + + // Run the test normally + + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount); + + // Put a bunch of ASCII data at the beginning (to test the call to ASCIIUtility at method entry) + + processedInput = Enumerable.Repeat('x', 128).Concat(processedInput).ToArray(); + + if (expectedIdxOfFirstInvalidChar >= 0) + { + expectedIdxOfFirstInvalidChar += 128; + } + expectedRuneCount += 128; + expectedUtf8ByteCount += 128; + + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount); + + // Change the first few chars to a mixture of 2-byte and 3-byte UTF-8 sequences + // This makes sure the vectorized code paths can properly handle these. 
+ + processedInput[0] = '\u0080'; // 2-byte UTF-8 sequence + processedInput[1] = '\u0800'; // 3-byte UTF-8 sequence + processedInput[2] = '\u0080'; // 2-byte UTF-8 sequence + processedInput[3] = '\u0800'; // 3-byte UTF-8 sequence + processedInput[4] = '\u0080'; // 2-byte UTF-8 sequence + processedInput[5] = '\u0800'; // 3-byte UTF-8 sequence + processedInput[6] = '\u0080'; // 2-byte UTF-8 sequence + processedInput[7] = '\u0800'; // 3-byte UTF-8 sequence + + expectedUtf8ByteCount += 12; + + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount); + + // Throw some surrogate pairs into the mix to make sure they're also handled properly + // by the vectorized code paths. + + processedInput[8] = '\u0080'; // 2-byte UTF-8 sequence + processedInput[9] = '\u0800'; // 3-byte UTF-8 sequence + processedInput[10] = '\u0080'; // 2-byte UTF-8 sequence + processedInput[11] = '\u0800'; // 3-byte UTF-8 sequence + processedInput[12] = '\u0080'; // 2-byte UTF-8 sequence + processedInput[13] = '\uD800'; // high surrogate + processedInput[14] = '\uDC00'; // low surrogate + processedInput[15] = 'z'; // ASCII char + + expectedRuneCount--; + expectedUtf8ByteCount += 9; + + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount); + + // Split the next surrogate pair across the vector boundary (so that we + // don't inadvertently treat this as a standalone surrogate sequence). 
+ + processedInput[15] = '\uDBFF'; // high surrogate + processedInput[16] = '\uDFFF'; // low surrogate + + expectedRuneCount--; + expectedUtf8ByteCount += 2; + + GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount); + } + + private static unsafe void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(char[] input, int expectedRetVal, int expectedRuneCount, long expectedUtf8ByteCount) + { + // Arrange + + using BoundedMemory boundedMemory = BoundedMemory.AllocateFromExistingData(input); + boundedMemory.MakeReadonly(); + + // Act + + int actualRetVal; + long actualUtf8CodeUnitCount; + int actualRuneCount; + + fixed (char* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span)) + { + char* pFirstInvalidChar = _getPointerToFirstInvalidCharFn.Value(pInputBuffer, input.Length, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment); + + long ptrDiff = pFirstInvalidChar - pInputBuffer; + Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range."); + + Assert.True(utf8CodeUnitCountAdjustment >= 0, "UTF-8 code unit count adjustment must be non-negative."); + Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative."); + + actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff; + + // The last two 'out' parameters are: + // a) The number to be added to the "chars processed" return value to come up with the total UTF-8 code unit count, and + // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count. 
+ + actualUtf8CodeUnitCount = ptrDiff + utf8CodeUnitCountAdjustment; + actualRuneCount = (int)ptrDiff + scalarCountAdjustment; + } + + // Assert + + Assert.Equal(expectedRetVal, actualRetVal); + Assert.Equal(expectedRuneCount, actualRuneCount); + Assert.Equal(expectedUtf8ByteCount, actualUtf8CodeUnitCount); + } + + private static Lazy CreateGetPointerToFirstInvalidCharFn() + { + return new Lazy(() => + { + Type utf16UtilityType = typeof(Utf8).Assembly.GetType("System.Text.Unicode.Utf16Utility"); + + if (utf16UtilityType is null) + { + throw new Exception("Couldn't find Utf16Utility type in System.Private.CoreLib."); + } + + MethodInfo methodInfo = utf16UtilityType.GetMethod("GetPointerToFirstInvalidChar", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic); + + if (methodInfo is null) + { + throw new Exception("Couldn't find GetPointerToFirstInvalidChar method on Utf16Utility."); + } + + return (GetPointerToFirstInvalidCharDel)methodInfo.CreateDelegate(typeof(GetPointerToFirstInvalidCharDel)); + }); + } + + private static string ProcessInput(string input) + { + input = input.Replace("", "\u00E9", StringComparison.Ordinal); // U+00E9 LATIN SMALL LETTER E WITH ACUTE + input = input.Replace("", "\u20AC", StringComparison.Ordinal); // U+20AC EURO SIGN + input = input.Replace("", "\U0001F600", StringComparison.Ordinal); // U+1F600 GRINNING FACE + + // Replace with \uABCD. This allows us to flow potentially malformed + // UTF-16 strings without Xunit. (The unit testing framework gets angry when + // we try putting invalid UTF-16 data as inline test data.) 
+ + int idx; + while ((idx = input.IndexOf('<')) >= 0) + { + input = input[..idx] + (char)ushort.Parse(input.Substring(idx + 1, 4), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture) + input[idx + 6..]; + } + + return input; + } + } +} diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs index 18ceedc2f832..5432da089bdb 100644 --- a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs +++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs @@ -119,6 +119,34 @@ public void ToBytes_WithLargeValidBuffers(string utf16Input) expectedNumCharsRead: expectedNumCharsConsumed, expectedUtf8Transcoding: concatenatedUtf8); } + + // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths + + utf16Input = new string('x', 64) + utf16Input; + concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray(); + + ToBytes_Test_Core( + utf16Input: utf16Input, + destinationSize: concatenatedUtf8.Length, + replaceInvalidSequences: false, + isFinalChunk: true, + expectedOperationStatus: OperationStatus.Done, + expectedNumCharsRead: utf16Input.Length, + expectedUtf8Transcoding: concatenatedUtf8); + + // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths + + utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..]; + concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray(); + + ToBytes_Test_Core( + utf16Input: utf16Input, + destinationSize: concatenatedUtf8.Length, + replaceInvalidSequences: false, + isFinalChunk: true, + expectedOperationStatus: OperationStatus.Done, + expectedNumCharsRead: utf16Input.Length, + expectedUtf8Transcoding: concatenatedUtf8); } [Theory] @@ -162,6 +190,18 @@ public void ToBytes_WithInvalidSurrogates(string utf16Input, int expectedNumChar expectedOperationStatus: 
OperationStatus.InvalidData, expectedNumCharsRead: expectedNumCharsConsumed, expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex)); + + // Now try the tests again with a larger buffer. + // This ensures that running out of destination space wasn't the reason we failed. + + ToBytes_Test_Core( + utf16Input: utf16Input, + destinationSize: (expectedUtf8TranscodingHex.Length) / 2 + 16, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.InvalidData, + expectedNumCharsRead: expectedNumCharsConsumed, + expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex)); } [Theory] diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs index 6dda95dffc10..cb3933891ce0 100644 --- a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs +++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs @@ -42,6 +42,18 @@ public void ToChars_WithSmallInvalidBuffers(string utf8HexInput, int expectedNum expectedOperationStatus: OperationStatus.InvalidData, expectedNumBytesRead: expectedNumBytesConsumed, expectedUtf16Transcoding: expectedUtf16Transcoding); + + // Now try the tests again with a larger buffer. + // This ensures that running out of destination space wasn't the reason we failed. 
+ + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput), + destinationSize: expectedUtf16Transcoding.Length + 16, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.InvalidData, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: expectedUtf16Transcoding); } [Theory] @@ -74,6 +86,18 @@ public void ToChars_WithVariousIncompleteBuffers(string utf8HexInput, int expect expectedOperationStatus: OperationStatus.NeedMoreData, expectedNumBytesRead: expectedNumBytesConsumed, expectedUtf16Transcoding: expectedUtf16Transcoding); + + // Now try the tests again with a larger buffer. + // This ensures that running out of destination space wasn't the reason we failed. + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput), + destinationSize: expectedUtf16Transcoding.Length + 16, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.NeedMoreData, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: expectedUtf16Transcoding); } [Theory] @@ -104,7 +128,7 @@ public void ToChars_WithVariousIncompleteBuffers(string utf8HexInput, int expect [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing [InlineData(GRINNING_FACE_UTF16 + GRINNING_FACE_UTF16)] // 2x 4-byte sequences, exercises 4-byte sequence processing [InlineData(GRINNING_FACE_UTF16 + "@AB")] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic - [InlineData("\U0001F938\U0001F3FD\u200D\u2640\uFE0F")] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths + [InlineData(WOMAN_CARTWHEELING_MEDSKIN_UTF16)] // exercises switching between multiple sequence lengths public void 
ToChars_ValidBuffers(string utf16Input) { // We're going to run the tests with destination buffer lengths ranging from 0 all the way @@ -162,6 +186,34 @@ public void ToChars_ValidBuffers(string utf16Input) expectedNumBytesRead: expectedNumBytesConsumed, expectedUtf16Transcoding: concatenatedUtf16); } + + // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths + + utf16Input = new string('x', 64) + utf16Input; + utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray(); + + ToChars_Test_Core( + utf8Input: utf8Input, + destinationSize: utf16Input.Length, + replaceInvalidSequences: false, + isFinalChunk: true, + expectedOperationStatus: OperationStatus.Done, + expectedNumBytesRead: utf8Input.Length, + expectedUtf16Transcoding: utf16Input); + + // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths + + utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..]; + utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray(); + + ToChars_Test_Core( + utf8Input: utf8Input, + destinationSize: utf16Input.Length, + replaceInvalidSequences: false, + isFinalChunk: true, + expectedOperationStatus: OperationStatus.Done, + expectedNumBytesRead: utf8Input.Length, + expectedUtf16Transcoding: utf16Input); } [Theory] @@ -182,6 +234,7 @@ public void ToChars_ValidBuffers(string utf16Input) [InlineData("3031" + "E17F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD [InlineData("3031" + "E1C080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD [InlineData("3031" + "EDA080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Surrogate 3-byte sequence at start of DWORD + [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, "01\u6708")] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences [InlineData("3031" + "F5808080", 2, "01")] // [ F5 ] is 
always invalid [InlineData("3031" + "F6808080", 2, "01")] // [ F6 ] is always invalid [InlineData("3031" + "F7808080", 2, "01")] // [ F7 ] is always invalid @@ -208,6 +261,18 @@ public void ToChars_WithLargeInvalidBuffers(string utf8HexInput, int expectedNum expectedOperationStatus: OperationStatus.InvalidData, expectedNumBytesRead: expectedNumBytesConsumed, expectedUtf16Transcoding: expectedUtf16Transcoding); + + // Now try the tests again with a larger buffer. + // This ensures that running out of destination space wasn't the reason we failed. + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput), + destinationSize: expectedUtf16Transcoding.Length + 16, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.InvalidData, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: expectedUtf16Transcoding); } [Theory] diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs index 087235a81b74..f57c769c3697 100644 --- a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs +++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs @@ -33,7 +33,9 @@ public partial class Utf8Tests private const string GRINNING_FACE_UTF8 = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes private const string GRINNING_FACE_UTF16 = "\U0001F600"; - + + private const string WOMAN_CARTWHEELING_MEDSKIN_UTF16 = "\U0001F938\U0001F3FD\u200D\u2640\uFE0F"; // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE + // All valid scalars [ U+0000 .. U+D7FF ] and [ U+E000 .. U+10FFFF ]. 
private static readonly IEnumerable s_allValidScalars = Enumerable.Range(0x0000, 0xD800).Concat(Enumerable.Range(0xE000, 0x110000 - 0xE000)).Select(value => new Rune(value)); @@ -59,7 +61,7 @@ static Utf8Tests() * COMMON UTILITIES FOR UNIT TESTS */ - private static byte[] DecodeHex(ReadOnlySpan inputHex) + public static byte[] DecodeHex(ReadOnlySpan inputHex) { Assert.True(Regex.IsMatch(inputHex.ToString(), "^([0-9a-fA-F]{2})*$"), "Input must be an even number of hex characters."); @@ -74,7 +76,7 @@ private static byte[] DecodeHex(ReadOnlySpan inputHex) // !! IMPORTANT !! // Don't delete this implementation, as we use it as a reference to make sure the framework's // transcoding logic is correct. - private static byte[] ToUtf8(Rune rune) + public static byte[] ToUtf8(Rune rune) { Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed."); diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs new file mode 100644 index 000000000000..899faa86ce3d --- /dev/null +++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs @@ -0,0 +1,417 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Buffers; +using System.Linq; +using System.Reflection; +using System.Runtime.InteropServices; +using Xunit; + +namespace System.Text.Unicode.Tests +{ + public partial class Utf8UtilityTests + { + private unsafe delegate byte* GetPointerToFirstInvalidByteDel(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); + private static readonly Lazy _getPointerToFirstInvalidByteFn = CreateGetPointerToFirstInvalidByteFn(); + + private const string X = "58"; // U+0058 LATIN CAPITAL LETTER X, 1 byte + private const string Y = "59"; // U+0059 LATIN CAPITAL LETTER Y, 1 byte + private const string Z = "5A"; // U+005A LATIN CAPITAL LETTER Z, 1 byte + private const string E_ACUTE = "C3A9"; // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 bytes + private const string EURO_SYMBOL = "E282AC"; // U+20AC EURO SIGN, 3 bytes + private const string GRINNING_FACE = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes + + [Theory] + [InlineData("", 0, 0)] // empty string is OK + [InlineData(X, 1, 0)] + [InlineData(X + Y, 2, 0)] + [InlineData(X + Y + Z, 3, 0)] + [InlineData(E_ACUTE, 1, 0)] + [InlineData(X + E_ACUTE, 2, 0)] + [InlineData(E_ACUTE + X, 2, 0)] + [InlineData(EURO_SYMBOL, 1, 0)] + public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount) + { + // These test cases are for the "slow processing" code path at the end of GetIndexOfFirstInvalidUtf8Sequence, + // so inputs should be less than 4 bytes. 
+ + Assert.InRange(input.Length, 0, 6); + + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, -1 /* expectedRetVal */, expectedRuneCount, expectedSurrogatePairCount); + } + + [Theory] + [InlineData("80", 0, 0, 0)] // sequence cannot begin with continuation character + [InlineData("8182", 0, 0, 0)] // sequence cannot begin with continuation character + [InlineData("838485", 0, 0, 0)] // sequence cannot begin with continuation character + [InlineData(X + "80", 1, 1, 0)] // sequence cannot begin with continuation character + [InlineData(X + "8182", 1, 1, 0)] // sequence cannot begin with continuation character + [InlineData("C0", 0, 0, 0)] // [ C0 ] is always invalid + [InlineData("C080", 0, 0, 0)] // [ C0 ] is always invalid + [InlineData("C08081", 0, 0, 0)] // [ C0 ] is always invalid + [InlineData(X + "C1", 1, 1, 0)] // [ C1 ] is always invalid + [InlineData(X + "C180", 1, 1, 0)] // [ C1 ] is always invalid + [InlineData("C2", 0, 0, 0)] // [ C2 ] is improperly terminated + [InlineData(X + "C27F", 1, 1, 0)] // [ C2 ] is improperly terminated + [InlineData(X + "E282", 1, 1, 0)] // [ E2 82 ] is improperly terminated + [InlineData("E2827F", 0, 0, 0)] // [ E2 82 ] is improperly terminated + [InlineData("E09F80", 0, 0, 0)] // [ E0 9F ... ] is overlong + [InlineData("E0C080", 0, 0, 0)] // [ E0 ] is improperly terminated + [InlineData("ED7F80", 0, 0, 0)] // [ ED ] is improperly terminated + [InlineData("EDA080", 0, 0, 0)] // [ ED A0 ... ] is surrogate + public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount) + { + // These test cases are for the "slow processing" code path at the end of GetIndexOfFirstInvalidUtf8Sequence, + // so inputs should be less than 4 bytes. 
+ + Assert.InRange(input.Length, 0, 6); + + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount); + } + + [Theory] + [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F", 21, 0)] // Loop unrolling at end of buffer + [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F" + "3031323334353637" + E_ACUTE + "38393A3B3C3D3E3F", 38, 0)] // Loop unrolling interrupted by non-ASCII + [InlineData("212223" + E_ACUTE + "30313233", 8, 0)] // 3 ASCII bytes followed by non-ASCII + [InlineData("2122" + E_ACUTE + "30313233", 7, 0)] // 2 ASCII bytes followed by non-ASCII + [InlineData("21" + E_ACUTE + "30313233", 6, 0)] // 1 ASCII byte followed by non-ASCII + [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 4, 0)] // 4x 2-byte sequences, exercises optimization code path in 2-byte sequence processing + [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + "5051", 5, 0)] // 3x 2-byte sequences + 2 ASCII bytes, exercises optimization code path in 2-byte sequence processing + [InlineData(E_ACUTE + "5051", 3, 0)] // single 2-byte sequence + 2 trailing ASCII bytes, exercises draining logic in 2-byte sequence processing + [InlineData(E_ACUTE + "50" + E_ACUTE + "304050", 6, 0)] // single 2-byte sequences + 1 trailing ASCII byte + 2-byte sequence, exercises draining logic in 2-byte sequence processing + [InlineData(EURO_SYMBOL + "20", 2, 0)] // single 3-byte sequence + 1 trailing ASCII byte, exercises draining logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL + "203040", 4, 0)] // single 3-byte sequence + 3 trailing ASCII byte, exercises draining logic and "running out of data" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 3, 0)] // 3x 3-byte sequences, exercises "stay within 3-byte loop" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 4, 0)] // 4x 3-byte sequences, exercises "consume multiple 
bytes at a time" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + E_ACUTE, 4, 0)] // 3x 3-byte sequences + single 2-byte sequence, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL + EURO_SYMBOL + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 6, 0)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing + [InlineData(GRINNING_FACE + GRINNING_FACE, 2, 2)] // 2x 4-byte sequences, exercises 4-byte sequence processing + [InlineData(GRINNING_FACE + "303132", 4, 1)] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic + [InlineData("F09FA4B8" + "F09F8FBD" + "E2808D" + "E29980" + "EFB88F", 5, 2)] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths + public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount) + { + // These test cases are for the "fast processing" code which is the main loop of GetIndexOfFirstInvalidUtf8Sequence, + // so inputs should be >= 4 bytes. 
+ + Assert.True(input.Length >= 8); + + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, -1 /* expectedRetVal */, expectedRuneCount, expectedSurrogatePairCount); + } + + [Theory] + [InlineData("3031" + "80" + "202122232425", 2, 2, 0)] // Continuation character at start of sequence should match no bitmask + [InlineData("3031" + "C080" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD + [InlineData("3031" + "C180" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD + [InlineData("C280" + "C180", 2, 1, 0)] // Overlong 2-byte sequence at end of DWORD + [InlineData("C27F" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD + [InlineData("C2C0" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD + [InlineData("C280" + "C27F", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD + [InlineData("C280" + "C2C0", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD + [InlineData("C280" + "C280" + "80203040", 4, 2, 0)] // Continuation character at start of sequence, within "stay in 2-byte processing" optimization + [InlineData("C280" + "C280" + "C180" + "C280", 4, 2, 0)] // Overlong 2-byte sequence at start of DWORD, within "stay in 2-byte processing" optimization + [InlineData("C280" + "C280" + "C280" + "C180", 6, 3, 0)] // Overlong 2-byte sequence at end of DWORD, within "stay in 2-byte processing" optimization + [InlineData("3031" + "E09F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Overlong 3-byte sequence at start of DWORD + [InlineData("3031" + "E07F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "E0C080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "E17F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "E1C080" + 
EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "EDA080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Surrogate 3-byte sequence at start of DWORD + [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, 3, 0)] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences + [InlineData("3031" + "F5808080", 2, 2, 0)] // [ F5 ] is always invalid + [InlineData("3031" + "F6808080", 2, 2, 0)] // [ F6 ] is always invalid + [InlineData("3031" + "F7808080", 2, 2, 0)] // [ F7 ] is always invalid + [InlineData("3031" + "F8808080", 2, 2, 0)] // [ F8 ] is always invalid + [InlineData("3031" + "F9808080", 2, 2, 0)] // [ F9 ] is always invalid + [InlineData("3031" + "FA808080", 2, 2, 0)] // [ FA ] is always invalid + [InlineData("3031" + "FB808080", 2, 2, 0)] // [ FB ] is always invalid + [InlineData("3031" + "FC808080", 2, 2, 0)] // [ FC ] is always invalid + [InlineData("3031" + "FD808080", 2, 2, 0)] // [ FD ] is always invalid + [InlineData("3031" + "FE808080", 2, 2, 0)] // [ FE ] is always invalid + [InlineData("3031" + "FF808080", 2, 2, 0)] // [ FF ] is always invalid + public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount) + { + // These test cases are for the "fast processing" code which is the main loop of GetIndexOfFirstInvalidUtf8Sequence, + // so inputs should be >= 4 bytes.
+ + Assert.True(input.Length >= 8); + + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount); + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongTwoByteSequences_ReturnsInvalid() + { + // [ C0 ] is never a valid byte, indicates overlong 2-byte sequence + // We'll test that [ C0 ] [ 00..FF ] is treated as invalid + + for (int i = 0; i < 256; i++) + { + AssertIsInvalidTwoByteSequence(new byte[] { 0xC0, (byte)i }); + } + + // [ C1 ] is never a valid byte, indicates overlong 2-byte sequence + // We'll test that [ C1 ] [ 00..FF ] is treated as invalid + + for (int i = 0; i < 256; i++) + { + AssertIsInvalidTwoByteSequence(new byte[] { 0xC1, (byte)i }); + } + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedTwoByteSequences_ReturnsInvalid() + { + // Test [ C2..DF ] [ 00..7F ] and [ C2..DF ] [ C0..FF ] + + for (int i = 0xC2; i < 0xDF; i++) + { + for (int j = 0; j < 0x80; j++) + { + AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j }); + } + for (int j = 0xC0; j < 0x100; j++) + { + AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j }); + } + } + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongThreeByteSequences_ReturnsInvalid() + { + // [ E0 ] [ 80..9F ] [ 80..BF ] is overlong 3-byte sequence + + for (int i = 0x00; i < 0xA0; i++) + { + AssertIsInvalidThreeByteSequence(new byte[] { 0xE0, (byte)i, 0x80 }); + } + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithSurrogateThreeByteSequences_ReturnsInvalid() + { + // [ ED ] [ A0..BF ] [ 80..BF ] is surrogate 3-byte sequence + + for (int i = 0xA0; i < 0x100; i++) + { + AssertIsInvalidThreeByteSequence(new byte[] { 0xED, (byte)i, 0x80 }); + } + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedThreeByteSequence_ReturnsInvalid() + { + // [ E0..EF ] [ 80..BF ] [ !(80..BF) ] is improperly terminated 3-byte sequence 
+ + for (int i = 0xE0; i < 0xF0; i++) + { + for (int j = 0x00; j < 0x80; j++) + { + // Use both '9F' and 'A0' to make sure at least one isn't caught by overlong / surrogate checks + AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0x9F, (byte)j }); + AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0xA0, (byte)j }); + } + for (int j = 0xC0; j < 0x100; j++) + { + // Use both '9F' and 'A0' to make sure at least one isn't caught by overlong / surrogate checks + AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0x9F, (byte)j }); + AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0xA0, (byte)j }); + } + } + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongFourByteSequences_ReturnsInvalid() + { + // [ F0 ] [ 80..8F ] [ 80..BF ] [ 80..BF ] is overlong 4-byte sequence + + for (int i = 0x00; i < 0x90; i++) + { + AssertIsInvalidFourByteSequence(new byte[] { 0xF0, (byte)i, 0x80, 0x80 }); + } + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithOutOfRangeFourByteSequences_ReturnsInvalid() + { + // [ F4 ] [ 90..BF ] [ 80..BF ] [ 80..BF ] is out-of-range 4-byte sequence + + for (int i = 0x90; i < 0x100; i++) + { + AssertIsInvalidFourByteSequence(new byte[] { 0xF4, (byte)i, 0x80, 0x80 }); + } + } + + [Fact] + public void GetIndexOfFirstInvalidUtf8Sequence_WithInvalidFourByteSequence_ReturnsInvalid() + { + // [ F0..F4 ] [ !(80..BF) ] [ !(80..BF) ] [ !(80..BF) ] is improperly terminated 4-byte sequence + + for (int i = 0xF0; i < 0xF5; i++) + { + for (int j = 0x00; j < 0x80; j++) + { + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 }); + + // Use both '8F' and '90' to make sure at least one isn't caught by overlong / out-of-range checks + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x9F, (byte)j, 0x80 }); + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0xA0, (byte)j, 0x80 }); + + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x9F, 0x80, (byte)j }); + 
AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0xA0, 0x80, (byte)j }); + } + for (int j = 0xC0; j < 0x100; j++) + { + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 }); + + // Use both '8F' and '90' to make sure at least one isn't caught by overlong / out-of-range checks + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x9F, (byte)j, 0x80 }); + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0xA0, (byte)j, 0x80 }); + + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x9F, 0x80, (byte)j }); + AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0xA0, 0x80, (byte)j }); + } + } + } + + private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence) + { + Assert.Equal(2, invalidSequence.Length); + + byte[] knownGoodBytes = Utf8Tests.DecodeHex(E_ACUTE); + + byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + + toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of first DWORD + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 2, 1, 0); + + // Run the same tests but with extra data at the beginning so that we're inside one of + // the 2-byte processing "hot loop" code paths. 
+ + toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of next DWORD + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 2, 0); + + toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of next DWORD + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 3, 0); + } + + private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence) + { + Assert.Equal(3, invalidSequence.Length); + + byte[] knownGoodBytes = Utf8Tests.DecodeHex(EURO_SYMBOL); + + byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + + // Run the same tests but with extra data at the beginning so that we're inside one of + // the 3-byte processing "hot loop" code paths. + + toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling first and second DWORDs + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 3, 1, 0); + + toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling second and third DWORDs + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 2, 0); + + toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of third DWORD + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 9, 3, 0); + } + + private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence) + { + Assert.Equal(4, invalidSequence.Length); + + byte[] knownGoodBytes = Utf8Tests.DecodeHex(GRINNING_FACE); + + byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0); + + toTest = 
knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 1, 1); + } + + private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(string inputHex, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount) + { + byte[] inputBytes = Utf8Tests.DecodeHex(inputHex); + + // Run the test normally + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount); + + // Then run the test with a bunch of ASCII data at the beginning (to exercise the vectorized code paths) + inputBytes = Enumerable.Repeat((byte)'x', 128).Concat(inputBytes).ToArray(); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, (expectedRetVal < 0) ? expectedRetVal : (expectedRetVal + 128), expectedRuneCount + 128, expectedSurrogatePairCount); + + // Then put a few more ASCII bytes at the beginning (to test that offsets are properly handled) + inputBytes = Enumerable.Repeat((byte)'x', 7).Concat(inputBytes).ToArray(); + GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, (expectedRetVal < 0) ? 
expectedRetVal : (expectedRetVal + 135), expectedRuneCount + 135, expectedSurrogatePairCount); + } + + private static unsafe void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount) + { + // Arrange + + using BoundedMemory boundedMemory = BoundedMemory.AllocateFromExistingData(input); + boundedMemory.MakeReadonly(); + + // Act + + int actualRetVal; + int actualSurrogatePairCount; + int actualRuneCount; + + fixed (byte* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span)) + { + byte* pFirstInvalidByte = _getPointerToFirstInvalidByteFn.Value(pInputBuffer, input.Length, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); + + long ptrDiff = pFirstInvalidByte - pInputBuffer; + Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range."); + + Assert.True(utf16CodeUnitCountAdjustment <= 0, "UTF-16 code unit count adjustment must be 0 or negative."); + Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative."); + + actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff; + + // The last two 'out' parameters are: + // a) The number to be added to the "bytes processed" return value to come up with the total UTF-16 code unit count, and + // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count. + + int totalUtf16CodeUnitCount = (int)ptrDiff + utf16CodeUnitCountAdjustment; + actualRuneCount = totalUtf16CodeUnitCount + scalarCountAdjustment; + + // Surrogate pair count is number of UTF-16 code units less the number of scalars. 
+ + actualSurrogatePairCount = totalUtf16CodeUnitCount - actualRuneCount; + } + + // Assert + + Assert.Equal(expectedRetVal, actualRetVal); + Assert.Equal(expectedRuneCount, actualRuneCount); + Assert.Equal(expectedSurrogatePairCount, actualSurrogatePairCount); + } + + private static Lazy CreateGetPointerToFirstInvalidByteFn() + { + return new Lazy(() => + { + Type utf8UtilityType = typeof(Utf8).Assembly.GetType("System.Text.Unicode.Utf8Utility"); + + if (utf8UtilityType is null) + { + throw new Exception("Couldn't find Utf8Utility type in System.Private.CoreLib."); + } + + MethodInfo methodInfo = utf8UtilityType.GetMethod("GetPointerToFirstInvalidByte", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic); + + if (methodInfo is null) + { + throw new Exception("Couldn't find GetPointerToFirstInvalidByte method on Utf8Utility."); + } + + return (GetPointerToFirstInvalidByteDel)methodInfo.CreateDelegate(typeof(GetPointerToFirstInvalidByteDel)); + }); + } + } +} diff --git a/src/System.Text.Encoding/tests/NegativeEncodingTests.cs b/src/System.Text.Encoding/tests/NegativeEncodingTests.cs index 2718f6f5287c..e5ae72d3cf40 100644 --- a/src/System.Text.Encoding/tests/NegativeEncodingTests.cs +++ b/src/System.Text.Encoding/tests/NegativeEncodingTests.cs @@ -45,7 +45,14 @@ public static IEnumerable Encodings_TestData() public static unsafe void GetByteCount_Invalid(Encoding encoding) { // Chars is null - AssertExtensions.Throws(encoding is ASCIIEncoding ? "chars" : "s", () => encoding.GetByteCount((string)null)); + if (PlatformDetection.IsNetCore) + { + AssertExtensions.Throws((encoding is ASCIIEncoding || encoding is UTF8Encoding) ? "chars" : "s", () => encoding.GetByteCount((string)null)); + } + else + { + AssertExtensions.Throws((encoding is ASCIIEncoding) ? 
"chars" : "s", () => encoding.GetByteCount((string)null)); + } AssertExtensions.Throws("chars", () => encoding.GetByteCount((char[])null)); AssertExtensions.Throws("chars", () => encoding.GetByteCount((char[])null, 0, 0)); diff --git a/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingDecode.cs b/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingDecode.cs index 618858cbc559..03577e7fffaa 100644 --- a/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingDecode.cs +++ b/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingDecode.cs @@ -111,7 +111,7 @@ public void Decode(byte[] bytes, int index, int count, string expected) EncodingHelpers.Decode(new UTF8Encoding(false, true), bytes, index, count, expected); EncodingHelpers.Decode(new UTF8Encoding(true, true), bytes, index, count, expected); } - + public static IEnumerable Decode_InvalidBytes_TestData() { yield return new object[] { new byte[] { 196, 84, 101, 115, 116, 196, 196, 196, 176, 176, 84, 101, 115, 116, 176 }, 0, 15, "\uFFFDTest\uFFFD\uFFFD\u0130\uFFFDTest\uFFFD" }; @@ -126,97 +126,217 @@ public static IEnumerable Decode_InvalidBytes_TestData() yield return new object[] { validSurrogateBytes, 2, 2, "\uFFFD\uFFFD" }; yield return new object[] { validSurrogateBytes, 2, 1, "\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0xA0, 0x80 }, 0, 3, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0xAF, 0xBF }, 0, 3, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0xB0, 0x80 }, 0, 3, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0xBF, 0xBF }, 0, 3, "\uFFFD\uFFFD" }; - - // Invalid surrogate pair (low/low, high/high, low/high) - yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xAF, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80 }, 
0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" }; - - // Too high scalar value in surrogates - yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xEE, 0x80, 0x80 }, 0, 6, "\uFFFD\uFFFD\uE000" }; - yield return new object[] { new byte[] { 0xF4, 0x90, 0x80, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; - - // These are examples of overlong sequences. This can cause security - // vulnerabilities (e.g. MS00-078) so it is important we parse these as invalid. - yield return new object[] { new byte[] { 0xC0 }, 0, 1, "\uFFFD" }; - yield return new object[] { new byte[] { 0xC0, 0xAF }, 0, 2, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xE0, 0x80, 0xBF }, 0, 3, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF0, 0x80, 0x80, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF8, 0x80, 0x80, 0x80, 0xBF }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xFC, 0x80, 0x80, 0x80, 0x80, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xC0, 0xBF }, 0, 2, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xE0, 0x9C, 0x90 }, 0, 3, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF0, 0x8F, 0xA4, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xEF, 0x41 }, 0, 2, "\uFFFD\u0041" }; - yield return new object[] { new byte[] { 0xEF, 0xBF, 0xAE }, 0, 1, "\uFFFD" }; - yield return new object[] { new byte[] { 0xEF, 0xBF, 0x41 }, 0, 3, "\uFFFD\u0041" }; - yield return new object[] { new byte[] { 0xEF, 0xBF, 0x61 }, 0, 3, "\uFFFD\u0061" }; - yield return new object[] { new byte[] { 0xEF, 0xBF, 0xEF, 0xBF, 0xAE }, 0, 5, "\uFFFD\uFFEE" }; - yield return new object[] { new byte[] { 0xEF, 0xBF, 0xC0, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xF0, 0xC4, 0x80 }, 0, 3, "\uFFFD\u0100" }; - - yield return new object[] { new byte[] { 176 }, 0, 1, "\uFFFD" }; - yield 
return new object[] { new byte[] { 196 }, 0, 1, "\uFFFD" }; - - yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0x52, 0x7C, 0x7B, 0x41, 0x6E, 0x47, 0x65, 0xA3, 0xA4 }, 0, 12, "\uFFFD\uFFFD\u0061\u0052\u007C\u007B\u0041\u006E\u0047\u0065\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xA3 }, 0, 1, "\uFFFD" }; - yield return new object[] { new byte[] { 0xA3, 0xA4 }, 0, 2, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0x65, 0xA3, 0xA4 }, 0, 3, "\u0065\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0x47, 0x65, 0xA3, 0xA4 }, 0, 4, "\u0047\u0065\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3, 0xA4 }, 0, 5, "\uFFFD\uFFFD\u0061\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3 }, 0, 4, "\uFFFD\uFFFD\u0061\uFFFD" }; - yield return new object[] { new byte[] { 0xD0, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" }; - yield return new object[] { new byte[] { 0xA4, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" }; - yield return new object[] { new byte[] { 0xD0, 0x61, 0x52, 0xA3 }, 0, 4, "\uFFFD\u0061\u0052\uFFFD" }; - - yield return new object[] { new byte[] { 0xAA }, 0, 1, "\uFFFD" }; - yield return new object[] { new byte[] { 0xAA, 0x41 }, 0, 2, "\uFFFD\u0041" }; - - yield return new object[] { new byte[] { 0xEF, 0xFF, 0xEE }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xEF, 0xFF, 0xAE }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 15, "\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }, 0, 15, 
"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F" }; - - yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 8, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xC2, 0xDF }, 0, 2, "\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0x80, 0x80, 0xC1, 0x80, 0xC1, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0x7F, 0x7F, 0x7F, 0x7F, 0xC3, 0xA1, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 14, "\uFFFD\u007F\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u00E1\uFFFD\u007F\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0xE0, 0xBF, 0x7F, 0xE0, 0xBF, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80, 0xE0, 0xC0, 0x80, 0xE0, 0x9F, 0xBF, 0xE0, 0xC0, 0xBF }, 0, 12, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0x7F, 0xE0, 0xBF, 0x7F, 0xC3, 0xA1, 0xE0, 0xBF, 0xC0 }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\u007F\uFFFD\u007F\u00E1\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xE1, 0x80, 0x7F, 0xE1, 0x80, 0xC0, 0xE1, 0xBF, 0x7F, 0xE1, 0xBF, 0xC0, 0xEC, 0x80, 0x7F, 0xEC, 0x80, 0xC0, 0xEC, 0xBF, 0x7F, 0xEC, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xE1, 0x7F, 0x80, 0xE1, 0xC0, 0x80, 0xE1, 0x7F, 0xBF, 0xE1, 0xC0, 0xBF, 0xEC, 0x7F, 0x80, 0xEC, 0xC0, 0x80, 0xEC, 0x7F, 0xBF, 0xEC, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xED, 0x80, 0x7F, 0xED, 0x80, 0xC0, 0xED, 0x9F, 
0x7F, 0xED, 0x9F, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xE8, 0x80, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u8000\uFFFD\u007F\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xEE, 0x80, 0x7F, 0xEE, 0x80, 0xC0, 0xEE, 0xBF, 0x7F, 0xEE, 0xBF, 0xC0, 0xEF, 0x80, 0x7F, 0xEF, 0x80, 0xC0, 0xEF, 0xBF, 0x7F, 0xEF, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xEE, 0x7F, 0x80, 0xEE, 0xC0, 0x80, 0xEE, 0x7F, 0xBF, 0xEE, 0xC0, 0xBF, 0xEF, 0x7F, 0x80, 0xEF, 0xC0, 0x80, 0xEF, 0x7F, 0xBF, 0xEF, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x7F, 0xF0, 0x90, 0x80, 0xC0, 0xF0, 0xBF, 0xBF, 0x7F, 0xF0, 0xBF, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF0, 0x90, 0x7F, 0x80, 0xF0, 0x90, 0xC0, 0x80, 0xF0, 0x90, 0x7F, 0xBF, 0xF0, 0x90, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80, 0x80, 0xF0, 0xC0, 0x80, 0x80, 0xF0, 0x8F, 0xBF, 0xBF, 0xF0, 0xC0, 0xBF, 0xBF }, 0, 16, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xF1, 0x80, 0x80, 0x7F, 0xF1, 0x80, 0x80, 0xC0, 0xF1, 0xBF, 0xBF, 0x7F, 0xF1, 0xBF, 0xBF, 0xC0, 0xF3, 0x80, 0x80, 0x7F, 0xF3, 0x80, 0x80, 0xC0, 0xF3, 
0xBF, 0xBF, 0x7F, 0xF3, 0xBF, 0xBF, 0xC0 }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF1, 0x80, 0x7F, 0x80, 0xF1, 0x80, 0xC0, 0x80, 0xF1, 0x80, 0x7F, 0xBF, 0xF1, 0x80, 0xC0, 0xBF, 0xF3, 0x80, 0x7F, 0x80, 0xF3, 0x80, 0xC0, 0x80, 0xF3, 0x80, 0x7F, 0xBF, 0xF3, 0x80, 0xC0, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF1, 0x7F, 0x80, 0x80, 0xF1, 0xC0, 0x80, 0x80, 0xF1, 0x7F, 0xBF, 0xBF, 0xF1, 0xC0, 0xBF, 0xBF, 0xF3, 0x7F, 0x80, 0x80, 0xF3, 0xC0, 0x80, 0x80, 0xF3, 0x7F, 0xBF, 0xBF, 0xF3, 0xC0, 0xBF, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; - - yield return new object[] { new byte[] { 0xF4, 0x80, 0x80, 0x7F, 0xF4, 0x80, 0x80, 0xC0, 0xF4, 0x8F, 0xBF, 0x7F, 0xF4, 0x8F, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF4, 0x80, 0x7F, 0x80, 0xF4, 0x80, 0xC0, 0x80, 0xF4, 0x80, 0x7F, 0xBF, 0xF4, 0x80, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; - yield return new object[] { new byte[] { 0xF4, 0x7F, 0x80, 0x80, 0xF4, 0x90, 0x80, 0x80, 0xF4, 0x7F, 0xBF, 0xBF, 0xF4, 0x90, 0xBF, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + if (PlatformDetection.IsNetCore) + { + // Overlong 2-byte sequences + yield return new object[] { new byte[] { 0xC0, 0x80 }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xC1, 0x80 }, 0, 2, "\uFFFD\uFFFD" }; + + // Incomplete 2-byte sequences + yield return new object[] { new byte[] { 0xC2, 0x41 }, 0, 2, 
"\uFFFD\u0041" }; + yield return new object[] { new byte[] { 0xC2, 0x41 }, 0, 2, "\uFFFD\u0041" }; + + // Overlong 3-byte sequences + yield return new object[] { new byte[] { 0xE0, 0x80, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + + // Truncated 3-byte sequences + yield return new object[] { new byte[] { 0xE0, 0xA0, 0x41 }, 0, 3, "\uFFFD\u0041" }; + yield return new object[] { new byte[] { 0xED, 0x9F, 0x41 }, 0, 3, "\uFFFD\u0041" }; + + // UTF-16 surrogate code points (invalid to be encoded in UTF-8) + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xAF, 0xBF }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xB0, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xBF, 0xBF }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xAF, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + + // Overlong 4-byte sequences + yield return new object[] { new byte[] { 0xF0, 0x80, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80 }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + + // Truncated 4-byte sequences + yield return new object[] { new byte[] { 0xF0, 0x90, 0x41, 0x42 }, 0, 4, "\uFFFD\u0041\u0042" }; + yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x42 }, 0, 4, "\uFFFD\u0042" }; + + // Too high scalar value in surrogates + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xEE, 0x80, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uE000" }; + yield return new object[] { new byte[] 
{ 0xF4, 0x90, 0x80, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD\uFFFD" }; + + // More examples of overlong sequences. This can cause security + // vulnerabilities (e.g. MS00-078) so it is important we parse these as invalid. + yield return new object[] { new byte[] { 0xC0 }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xC0, 0xAF }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0x80, 0xBF }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x80, 0x80, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF8, 0x80, 0x80, 0x80, 0xBF }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xFC, 0x80, 0x80, 0x80, 0x80, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xC0, 0xBF }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0x9C, 0x90 }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x8F, 0xA4, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xEF, 0x41 }, 0, 2, "\uFFFD\u0041" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0xAE }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0x41 }, 0, 3, "\uFFFD\u0041" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0x61 }, 0, 3, "\uFFFD\u0061" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0xEF, 0xBF, 0xAE }, 0, 5, "\uFFFD\uFFEE" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0xC0, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF0, 0xC4, 0x80 }, 0, 3, "\uFFFD\u0100" }; + + yield return new object[] { new byte[] { 176 }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 196 }, 0, 1, "\uFFFD" }; + + yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0x52, 0x7C, 0x7B, 0x41, 0x6E, 0x47, 0x65, 0xA3, 0xA4 }, 0, 12, 
"\uFFFD\uFFFD\u0061\u0052\u007C\u007B\u0041\u006E\u0047\u0065\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xA3 }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xA3, 0xA4 }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x65, 0xA3, 0xA4 }, 0, 3, "\u0065\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x47, 0x65, 0xA3, 0xA4 }, 0, 4, "\u0047\u0065\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3, 0xA4 }, 0, 5, "\uFFFD\uFFFD\u0061\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3 }, 0, 4, "\uFFFD\uFFFD\u0061\uFFFD" }; + yield return new object[] { new byte[] { 0xD0, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" }; + yield return new object[] { new byte[] { 0xA4, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" }; + yield return new object[] { new byte[] { 0xD0, 0x61, 0x52, 0xA3 }, 0, 4, "\uFFFD\u0061\u0052\uFFFD" }; + + yield return new object[] { new byte[] { 0xAA }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xAA, 0x41 }, 0, 2, "\uFFFD\u0041" }; + + yield return new object[] { new byte[] { 0xEF, 0xFF, 0xEE }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xEF, 0xFF, 0xAE }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 15, "\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }, 0, 15, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F" }; + + yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 8, 
"\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xC2, 0xDF }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x80, 0x80, 0xC1, 0x80, 0xC1, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0x7F, 0x7F, 0x7F, 0x7F, 0xC3, 0xA1, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 14, "\uFFFD\u007F\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u00E1\uFFFD\u007F\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0xE0, 0xBF, 0x7F, 0xE0, 0xBF, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80, 0xE0, 0xC0, 0x80, 0xE0, 0x9F, 0xBF, 0xE0, 0xC0, 0xBF }, 0, 12, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0x7F, 0xE0, 0xBF, 0x7F, 0xC3, 0xA1, 0xE0, 0xBF, 0xC0 }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\u007F\uFFFD\u007F\u00E1\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xE1, 0x80, 0x7F, 0xE1, 0x80, 0xC0, 0xE1, 0xBF, 0x7F, 0xE1, 0xBF, 0xC0, 0xEC, 0x80, 0x7F, 0xEC, 0x80, 0xC0, 0xEC, 0xBF, 0x7F, 0xEC, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE1, 0x7F, 0x80, 0xE1, 0xC0, 0x80, 0xE1, 0x7F, 0xBF, 0xE1, 0xC0, 0xBF, 0xEC, 0x7F, 0x80, 0xEC, 0xC0, 0x80, 0xEC, 0x7F, 0xBF, 0xEC, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xED, 0x80, 0x7F, 0xED, 0x80, 0xC0, 0xED, 0x9F, 0x7F, 0xED, 0x9F, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xED, 0x7F, 0xBF, 
0xED, 0xA0, 0xBF }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xE8, 0x80, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\u8000\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xEE, 0x80, 0x7F, 0xEE, 0x80, 0xC0, 0xEE, 0xBF, 0x7F, 0xEE, 0xBF, 0xC0, 0xEF, 0x80, 0x7F, 0xEF, 0x80, 0xC0, 0xEF, 0xBF, 0x7F, 0xEF, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xEE, 0x7F, 0x80, 0xEE, 0xC0, 0x80, 0xEE, 0x7F, 0xBF, 0xEE, 0xC0, 0xBF, 0xEF, 0x7F, 0x80, 0xEF, 0xC0, 0x80, 0xEF, 0x7F, 0xBF, 0xEF, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x7F, 0xF0, 0x90, 0x80, 0xC0, 0xF0, 0xBF, 0xBF, 0x7F, 0xF0, 0xBF, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x90, 0x7F, 0x80, 0xF0, 0x90, 0xC0, 0x80, 0xF0, 0x90, 0x7F, 0xBF, 0xF0, 0x90, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80, 0x80, 0xF0, 0xC0, 0x80, 0x80, 0xF0, 0x8F, 0xBF, 0xBF, 0xF0, 0xC0, 0xBF, 0xBF }, 0, 16, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF1, 0x80, 0x80, 0x7F, 0xF1, 0x80, 0x80, 0xC0, 0xF1, 0xBF, 0xBF, 0x7F, 0xF1, 0xBF, 0xBF, 0xC0, 0xF3, 0x80, 0x80, 0x7F, 0xF3, 0x80, 0x80, 0xC0, 0xF3, 0xBF, 0xBF, 0x7F, 0xF3, 0xBF, 0xBF, 0xC0 }, 0, 32, 
"\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF1, 0x80, 0x7F, 0x80, 0xF1, 0x80, 0xC0, 0x80, 0xF1, 0x80, 0x7F, 0xBF, 0xF1, 0x80, 0xC0, 0xBF, 0xF3, 0x80, 0x7F, 0x80, 0xF3, 0x80, 0xC0, 0x80, 0xF3, 0x80, 0x7F, 0xBF, 0xF3, 0x80, 0xC0, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF1, 0x7F, 0x80, 0x80, 0xF1, 0xC0, 0x80, 0x80, 0xF1, 0x7F, 0xBF, 0xBF, 0xF1, 0xC0, 0xBF, 0xBF, 0xF3, 0x7F, 0x80, 0x80, 0xF3, 0xC0, 0x80, 0x80, 0xF3, 0x7F, 0xBF, 0xBF, 0xF3, 0xC0, 0xBF, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF4, 0x80, 0x80, 0x7F, 0xF4, 0x80, 0x80, 0xC0, 0xF4, 0x8F, 0xBF, 0x7F, 0xF4, 0x8F, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF4, 0x80, 0x7F, 0x80, 0xF4, 0x80, 0xC0, 0x80, 0xF4, 0x80, 0x7F, 0xBF, 0xF4, 0x80, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF4, 0x7F, 0x80, 0x80, 0xF4, 0x90, 0x80, 0x80, 0xF4, 0x7F, 0xBF, 0xBF, 0xF4, 0x90, 0xBF, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + } + else + { + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80 }, 0, 3, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xAF, 0xBF }, 0, 3, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xB0, 0x80 }, 0, 3, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xBF, 0xBF }, 0, 3, "\uFFFD\uFFFD" }; + + // Invalid 
surrogate pair (low/low, high/high, low/high) + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xAF, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xB0, 0x80, 0xED, 0xB0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80 }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD" }; + + // Too high scalar value in surrogates + yield return new object[] { new byte[] { 0xED, 0xA0, 0x80, 0xEE, 0x80, 0x80 }, 0, 6, "\uFFFD\uFFFD\uE000" }; + yield return new object[] { new byte[] { 0xF4, 0x90, 0x80, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; + + // These are examples of overlong sequences. This can cause security + // vulnerabilities (e.g. MS00-078) so it is important we parse these as invalid. + yield return new object[] { new byte[] { 0xC0 }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xC0, 0xAF }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0x80, 0xBF }, 0, 3, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x80, 0x80, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF8, 0x80, 0x80, 0x80, 0xBF }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xFC, 0x80, 0x80, 0x80, 0x80, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xC0, 0xBF }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0x9C, 0x90 }, 0, 3, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x8F, 0xA4, 0x80 }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xEF, 0x41 }, 0, 2, "\uFFFD\u0041" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0xAE }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0x41 }, 0, 3, "\uFFFD\u0041" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0x61 }, 0, 3, "\uFFFD\u0061" }; + 
yield return new object[] { new byte[] { 0xEF, 0xBF, 0xEF, 0xBF, 0xAE }, 0, 5, "\uFFFD\uFFEE" }; + yield return new object[] { new byte[] { 0xEF, 0xBF, 0xC0, 0xBF }, 0, 4, "\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF0, 0xC4, 0x80 }, 0, 3, "\uFFFD\u0100" }; + + yield return new object[] { new byte[] { 176 }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 196 }, 0, 1, "\uFFFD" }; + + yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0x52, 0x7C, 0x7B, 0x41, 0x6E, 0x47, 0x65, 0xA3, 0xA4 }, 0, 12, "\uFFFD\uFFFD\u0061\u0052\u007C\u007B\u0041\u006E\u0047\u0065\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xA3 }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xA3, 0xA4 }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x65, 0xA3, 0xA4 }, 0, 3, "\u0065\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x47, 0x65, 0xA3, 0xA4 }, 0, 4, "\u0047\u0065\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3, 0xA4 }, 0, 5, "\uFFFD\uFFFD\u0061\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xA4, 0xD0, 0x61, 0xA3 }, 0, 4, "\uFFFD\uFFFD\u0061\uFFFD" }; + yield return new object[] { new byte[] { 0xD0, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" }; + yield return new object[] { new byte[] { 0xA4, 0x61, 0xA3 }, 0, 3, "\uFFFD\u0061\uFFFD" }; + yield return new object[] { new byte[] { 0xD0, 0x61, 0x52, 0xA3 }, 0, 4, "\uFFFD\u0061\u0052\uFFFD" }; + + yield return new object[] { new byte[] { 0xAA }, 0, 1, "\uFFFD" }; + yield return new object[] { new byte[] { 0xAA, 0x41 }, 0, 2, "\uFFFD\u0041" }; + + yield return new object[] { new byte[] { 0xEF, 0xFF, 0xEE }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xEF, 0xFF, 0xAE }, 0, 3, "\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 5, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x7F, 
0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x90, 0xA0, 0xB0, 0xC1 }, 0, 15, "\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x80, 0x90, 0xA0, 0xB0, 0xC1, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }, 0, 15, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F\u007F" }; + + yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 8, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xC2, 0xDF }, 0, 2, "\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0x80, 0x80, 0xC1, 0x80, 0xC1, 0xBF }, 0, 6, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xC2, 0x7F, 0xC2, 0xC0, 0x7F, 0x7F, 0x7F, 0x7F, 0xC3, 0xA1, 0xDF, 0x7F, 0xDF, 0xC0 }, 0, 14, "\uFFFD\u007F\uFFFD\uFFFD\u007F\u007F\u007F\u007F\u00E1\uFFFD\u007F\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0xE0, 0xBF, 0x7F, 0xE0, 0xBF, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0x9F, 0x80, 0xE0, 0xC0, 0x80, 0xE0, 0x9F, 0xBF, 0xE0, 0xC0, 0xBF }, 0, 12, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE0, 0xA0, 0x7F, 0xE0, 0xA0, 0xC0, 0x7F, 0xE0, 0xBF, 0x7F, 0xC3, 0xA1, 0xE0, 0xBF, 0xC0 }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\u007F\uFFFD\u007F\u00E1\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xE1, 0x80, 0x7F, 0xE1, 0x80, 0xC0, 0xE1, 0xBF, 0x7F, 0xE1, 0xBF, 0xC0, 0xEC, 0x80, 0x7F, 0xEC, 0x80, 0xC0, 0xEC, 0xBF, 0x7F, 0xEC, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xE1, 0x7F, 0x80, 0xE1, 0xC0, 0x80, 0xE1, 0x7F, 0xBF, 0xE1, 0xC0, 
0xBF, 0xEC, 0x7F, 0x80, 0xEC, 0xC0, 0x80, 0xEC, 0x7F, 0xBF, 0xEC, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xED, 0x80, 0x7F, 0xED, 0x80, 0xC0, 0xED, 0x9F, 0x7F, 0xED, 0x9F, 0xC0 }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 12, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xED, 0x7F, 0x80, 0xED, 0xA0, 0x80, 0xE8, 0x80, 0x80, 0xED, 0x7F, 0xBF, 0xED, 0xA0, 0xBF }, 0, 15, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u8000\uFFFD\u007F\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xEE, 0x80, 0x7F, 0xEE, 0x80, 0xC0, 0xEE, 0xBF, 0x7F, 0xEE, 0xBF, 0xC0, 0xEF, 0x80, 0x7F, 0xEF, 0x80, 0xC0, 0xEF, 0xBF, 0x7F, 0xEF, 0xBF, 0xC0 }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xEE, 0x7F, 0x80, 0xEE, 0xC0, 0x80, 0xEE, 0x7F, 0xBF, 0xEE, 0xC0, 0xBF, 0xEF, 0x7F, 0x80, 0xEF, 0xC0, 0x80, 0xEF, 0x7F, 0xBF, 0xEF, 0xC0, 0xBF }, 0, 24, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF0, 0x90, 0x80, 0x7F, 0xF0, 0x90, 0x80, 0xC0, 0xF0, 0xBF, 0xBF, 0x7F, 0xF0, 0xBF, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x90, 0x7F, 0x80, 0xF0, 0x90, 0xC0, 0x80, 0xF0, 0x90, 0x7F, 0xBF, 0xF0, 0x90, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF0, 0x8F, 0x80, 0x80, 0xF0, 0xC0, 0x80, 0x80, 0xF0, 0x8F, 
0xBF, 0xBF, 0xF0, 0xC0, 0xBF, 0xBF }, 0, 16, "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF1, 0x80, 0x80, 0x7F, 0xF1, 0x80, 0x80, 0xC0, 0xF1, 0xBF, 0xBF, 0x7F, 0xF1, 0xBF, 0xBF, 0xC0, 0xF3, 0x80, 0x80, 0x7F, 0xF3, 0x80, 0x80, 0xC0, 0xF3, 0xBF, 0xBF, 0x7F, 0xF3, 0xBF, 0xBF, 0xC0 }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF1, 0x80, 0x7F, 0x80, 0xF1, 0x80, 0xC0, 0x80, 0xF1, 0x80, 0x7F, 0xBF, 0xF1, 0x80, 0xC0, 0xBF, 0xF3, 0x80, 0x7F, 0x80, 0xF3, 0x80, 0xC0, 0x80, 0xF3, 0x80, 0x7F, 0xBF, 0xF3, 0x80, 0xC0, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF1, 0x7F, 0x80, 0x80, 0xF1, 0xC0, 0x80, 0x80, 0xF1, 0x7F, 0xBF, 0xBF, 0xF1, 0xC0, 0xBF, 0xBF, 0xF3, 0x7F, 0x80, 0x80, 0xF3, 0xC0, 0x80, 0x80, 0xF3, 0x7F, 0xBF, 0xBF, 0xF3, 0xC0, 0xBF, 0xBF }, 0, 32, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + + yield return new object[] { new byte[] { 0xF4, 0x80, 0x80, 0x7F, 0xF4, 0x80, 0x80, 0xC0, 0xF4, 0x8F, 0xBF, 0x7F, 0xF4, 0x8F, 0xBF, 0xC0 }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF4, 0x80, 0x7F, 0x80, 0xF4, 0x80, 0xC0, 0x80, 0xF4, 0x80, 0x7F, 0xBF, 0xF4, 0x80, 0xC0, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD" }; + yield return new object[] { new byte[] { 0xF4, 0x7F, 0x80, 0x80, 0xF4, 0x90, 0x80, 0x80, 0xF4, 0x7F, 0xBF, 0xBF, 0xF4, 0x90, 0xBF, 0xBF }, 0, 16, "\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u007F\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD" }; + } } 
[Theory]
@@ -229,5 +349,36 @@ public static void Decode_InvalidBytes(byte[] bytes, int index, int count, strin
             NegativeEncodingTests.Decode_Invalid(new UTF8Encoding(false, true), bytes, index, count);
             NegativeEncodingTests.Decode_Invalid(new UTF8Encoding(true, true), bytes, index, count);
         }
+
+        /// <summary>
+        /// Decodes ill-formed UTF-8 through a <see cref="DecoderReplacementFallback"/> and checks that the
+        /// caller-supplied replacement string appears once for each invalid subsequence of the input.
+        /// </summary>
+        [Theory]
+        [InlineData("", "ABCDEF")]
+        [InlineData("\uFFFD", "\uFFFDAB\uFFFDCD\uFFFDEF\uFFFD")]
+        [InlineData("?", "?AB?CD?EF?")]
+        [InlineData("\uFFFD?", "\uFFFD?AB\uFFFD?CD\uFFFD?EF\uFFFD?")]
+        public void Decode_InvalidChars_WithCustomReplacementFallback(string replacementString, string expected)
+        {
+            // One invalid subsequence precedes each ASCII pair, and a fourth one terminates the input.
+            byte[] malformedUtf8 =
+            {
+                0xC0,                   // a byte that is invalid wherever it appears
+                (byte)'A', (byte)'B',
+                0xF4, 0x80, 0xBF,       // 4-byte sequence that is missing its last byte
+                (byte)'C', (byte)'D',
+                0xE0,                   // 3-byte sequence cut off after the lead byte
+                (byte)'E', (byte)'F',
+                0xC2,                   // 2-byte sequence cut off after the lead byte
+            };
+
+            DecoderFallback replaceInvalidBytes = new DecoderReplacementFallback(replacementString);
+            Encoding utf8WithReplacement = Encoding.GetEncoding("utf-8", EncoderFallback.ExceptionFallback, replaceInvalidBytes);
+
+            string decoded = utf8WithReplacement.GetString(malformedUtf8);
+
+            Assert.Equal(expected, decoded);
+        }
     }
 }
diff --git a/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingEncode.cs b/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingEncode.cs
index e2e7bbc793a2..b15f4d10ea35 100644
--- a/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingEncode.cs
+++ b/src/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingEncode.cs
@@ -3,6 +3,7 @@
 // See the LICENSE file in the project root for more information.
using System.Collections.Generic;
+using System.Linq;
 using Xunit;

 namespace System.Text.Tests
@@ -248,5 +249,34 @@ public void Encode_InvalidChars(string chars, int index, int count, byte[] expec
                 new UTF8Encoding(encoderShouldEmitUTF8Identifier: true, throwOnInvalidBytes: true),
                 chars, index, count);
         }
+
+        /// <summary>
+        /// Encodes a UTF-16 string containing unpaired surrogates through an <see cref="EncoderReplacementFallback"/>
+        /// and checks that the caller-supplied replacement string is encoded once in place of each of them.
+        /// </summary>
+        [Theory]
+        [InlineData("", "ABCDEF")]
+        [InlineData("\uFFFD", "\uFFFDAB\uFFFDCD\uFFFDEF\uFFFD")]
+        [InlineData("?", "?AB?CD?EF?")]
+        [InlineData("\uFFFD?", "\uFFFD?AB\uFFFD?CD\uFFFD?EF\uFFFD?")]
+        public void Encode_InvalidChars_WithCustomReplacementFallback(string replacementString, string expected)
+        {
+            // Unpaired surrogates sit before "AB", "CD" and "EF", and one more ends the string.
+            string malformedUtf16 = "\uD800AB\uDC00CD\uDFFFEF\uDBFF";
+
+            // The expected bytes are spelled out by hand: U+FFFD becomes EF BF BD, and every other
+            // character is narrowed to a single byte with a plain cast.
+            byte[] expectedUtf8Output =
+                (from ch in expected
+                 from utf8Byte in (ch == '\uFFFD' ? new byte[] { 0xEF, 0xBF, 0xBD } : new byte[] { (byte)ch })
+                 select utf8Byte).ToArray();
+
+            EncoderFallback replaceInvalidChars = new EncoderReplacementFallback(replacementString);
+            Encoding utf8WithReplacement = Encoding.GetEncoding("utf-8", replaceInvalidChars, DecoderFallback.ExceptionFallback);
+
+            byte[] actualUtf8Output = utf8WithReplacement.GetBytes(malformedUtf16);
+
+            Assert.Equal(expectedUtf8Output, actualUtf8Output);
+        }
     }
 }