From 197a28d3ebbd5e405862f2c6c30a76cf66917de8 Mon Sep 17 00:00:00 2001 From: Pavel Savara Date: Wed, 17 Mar 2021 19:38:39 +0100 Subject: [PATCH] Include "simple" UTF-8 validation and transcoding logic for interpreted and low-footprint scenarios (#49372) --- .../System.Private.CoreLib.Shared.projitems | 12 +- .../Text/Unicode/Utf8Utility.SizeOpt.cs | 107 ++++++++++++++++++ ...ers.cs => Utf8Utility.SpeedOpt.Helpers.cs} | 0 ...cs => Utf8Utility.SpeedOpt.Transcoding.cs} | 0 ....cs => Utf8Utility.SpeedOpt.Validation.cs} | 4 + 5 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SizeOpt.cs rename src/libraries/System.Private.CoreLib/src/System/Text/Unicode/{Utf8Utility.Helpers.cs => Utf8Utility.SpeedOpt.Helpers.cs} (100%) rename src/libraries/System.Private.CoreLib/src/System/Text/Unicode/{Utf8Utility.Transcoding.cs => Utf8Utility.SpeedOpt.Transcoding.cs} (100%) rename src/libraries/System.Private.CoreLib/src/System/Text/Unicode/{Utf8Utility.Validation.cs => Utf8Utility.SpeedOpt.Validation.cs} (98%) diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index 7aed1af79c2b2..4c521218901a5 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -16,6 +16,7 @@ true $(MSBuildThisFileDirectory)ILLink\ true + true $(DefineConstants);TARGET_32BIT @@ -945,9 +946,6 @@ - - - @@ -1171,6 +1169,14 @@ Common\System\Threading\Tasks\TaskToApm.cs + + + + + + + + Common\Interop\Windows\Advapi32\Interop.ActivityControl.cs diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SizeOpt.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SizeOpt.cs new file mode 100644 index 0000000000000..856d4d38d36bb 
--- /dev/null
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SizeOpt.cs
@@ -0,0 +1,107 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Buffers;
+using System.Diagnostics;
+
+namespace System.Text.Unicode
+{
+    internal static unsafe partial class Utf8Utility
+    {
+        // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
+        // the next byte would have been consumed from / the next char would have been written to.
+        // inputLength in bytes, outputCharsRemaining in chars.
+        public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
+            Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+            var input = new ReadOnlySpan<byte>(pInputBuffer, inputLength);
+            var output = new Span<char>(pOutputBuffer, outputCharsRemaining);
+
+            OperationStatus opStatus = OperationStatus.Done;
+            while (!input.IsEmpty)
+            {
+                opStatus = Rune.DecodeFromUtf8(input, out Rune rune, out int bytesConsumedJustNow);
+                if (opStatus != OperationStatus.Done) { break; }
+                if (!rune.TryEncodeToUtf16(output, out int charsWrittenJustNow)) { opStatus = OperationStatus.DestinationTooSmall; break; }
+                input = input.Slice(bytesConsumedJustNow);
+                output = output.Slice(charsWrittenJustNow);
+            }
+
+            pInputBufferRemaining = pInputBuffer + inputLength - input.Length;
+            pOutputBufferRemaining = pOutputBuffer + outputCharsRemaining - output.Length;
+
+            return opStatus;
+        }
+
+        // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
+        // the next char would have been consumed from / the next byte would have been written to.
+        // inputLength in chars, outputBytesRemaining in bytes.
+        public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
+            Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+
+            var input = new ReadOnlySpan<char>(pInputBuffer, inputLength);
+            var output = new Span<byte>(pOutputBuffer, outputBytesRemaining);
+
+            OperationStatus opStatus = OperationStatus.Done;
+            while (!input.IsEmpty)
+            {
+                opStatus = Rune.DecodeFromUtf16(input, out Rune rune, out int charsConsumedJustNow);
+                if (opStatus != OperationStatus.Done) { break; }
+                if (!rune.TryEncodeToUtf8(output, out int bytesWrittenJustNow)) { opStatus = OperationStatus.DestinationTooSmall; break; }
+                input = input.Slice(charsConsumedJustNow);
+                output = output.Slice(bytesWrittenJustNow);
+            }
+
+            pInputBufferRemaining = pInputBuffer + inputLength - input.Length;
+            pOutputBufferRemaining = pOutputBuffer + outputBytesRemaining - output.Length;
+
+            return opStatus;
+        }
+
+        // Returns &inputBuffer[inputLength] if the input buffer is valid.
+        /// <summary>
+        /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
+        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+        /// </summary>
+        /// <remarks>
+        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+        /// </remarks>
+        /// <param name="pInputBuffer">Pointer to Utf8 byte buffer</param>
+        /// <param name="inputLength">Buffer length in bytes</param>
+        /// <param name="utf16CodeUnitCountAdjustment">Zero or negative number to be added to the "bytes processed" return value to come up with the total UTF-16 code unit count.</param>
+        /// <param name="scalarCountAdjustment">Zero or negative number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.</param>
+        public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            var input = new ReadOnlySpan<byte>(pInputBuffer, inputLength);
+            int cumulativeUtf16CodeUnitCount = 0;
+            int cumulativeScalarValueCount = 0;
+            while (!input.IsEmpty)
+            {
+                if (Rune.DecodeFromUtf8(input, out Rune rune, out int bytesConsumed) != OperationStatus.Done)
+                    break;
+                input = input.Slice(bytesConsumed);
+                cumulativeUtf16CodeUnitCount += rune.Utf16SequenceLength;
+                cumulativeScalarValueCount++;
+            }
+
+            int cumulativeBytesConsumed = inputLength - input.Length;
+            utf16CodeUnitCountAdjustment = cumulativeUtf16CodeUnitCount - cumulativeBytesConsumed;
+            scalarCountAdjustment = cumulativeScalarValueCount - cumulativeUtf16CodeUnitCount;
+            return pInputBuffer + cumulativeBytesConsumed;
+        }
+    }
+}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SpeedOpt.Helpers.cs
similarity index 100%
rename from src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs
rename to src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SpeedOpt.Helpers.cs
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SpeedOpt.Transcoding.cs
similarity index 100%
rename from src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs
rename to src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SpeedOpt.Transcoding.cs
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SpeedOpt.Validation.cs
similarity index 98%
rename from src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
rename to src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SpeedOpt.Validation.cs
index 93c404d3245fd..e5424addb26b4 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.SpeedOpt.Validation.cs
@@ -24,6 +24,10 @@ internal static unsafe partial class Utf8Utility
         /// <remarks>
         /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
         /// </remarks>
+        /// <param name="pInputBuffer">Pointer to Utf8 byte buffer</param>
+        /// <param name="inputLength">Buffer length in bytes</param>
+        /// <param name="utf16CodeUnitCountAdjustment">Zero or negative number to be added to the "bytes processed" return value to come up with the total UTF-16 code unit count.</param>
+        /// <param name="scalarCountAdjustment">Zero or negative number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.</param>
         public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
         {
             Debug.Assert(inputLength >= 0, "Input length must not be negative.");