From 2f0c80c8ae3a70c2e928c7ced7d718c98a1e6be2 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Sun, 10 Mar 2019 23:12:42 -0700 Subject: [PATCH] Add ref APIs and unit tests for System.Text.Unicode.Utf8 --- .../ref/Configurations.props | 3 +- .../ref/CoreFx.Private.TestUtilities.csproj | 11 +- ...CoreFx.Private.TestUtilities.netcoreapp.cs | 30 ++ .../src/CoreFx.Private.TestUtilities.csproj | 7 + .../System/Buffers/BoundedMemory.Creation.cs | 67 ++++ .../src/System/Buffers/BoundedMemory.Unix.cs | 47 +++ .../System/Buffers/BoundedMemory.Windows.cs | 333 ++++++++++++++++++ .../src/System/Buffers/BoundedMemory.cs | 49 +++ .../src/System/Buffers/PoisonPagePlacement.cs | 26 ++ src/System.Runtime/ref/System.Runtime.cs | 8 + .../tests/System.Runtime.Tests.csproj | 3 + .../Unicode/Utf8Tests.ToBytes.netcoreapp.cs | 264 ++++++++++++++ .../Unicode/Utf8Tests.ToChars.netcoreapp.cs | 304 ++++++++++++++++ .../Text/Unicode/Utf8Tests.netcoreapp.cs | 141 ++++++++ 14 files changed, 1290 insertions(+), 3 deletions(-) create mode 100644 src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.netcoreapp.cs create mode 100644 src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Creation.cs create mode 100644 src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Unix.cs create mode 100644 src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Windows.cs create mode 100644 src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.cs create mode 100644 src/CoreFx.Private.TestUtilities/src/System/Buffers/PoisonPagePlacement.cs create mode 100644 src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs create mode 100644 src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs create mode 100644 src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs diff --git a/src/CoreFx.Private.TestUtilities/ref/Configurations.props b/src/CoreFx.Private.TestUtilities/ref/Configurations.props index 
ff0d415e4593..04ae535c9867 100644 --- a/src/CoreFx.Private.TestUtilities/ref/Configurations.props +++ b/src/CoreFx.Private.TestUtilities/ref/Configurations.props @@ -1,7 +1,8 @@  + netcoreapp; netstandard; - \ No newline at end of file + diff --git a/src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.csproj b/src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.csproj index 01f758e62796..33f30d61e84f 100644 --- a/src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.csproj +++ b/src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.csproj @@ -1,13 +1,20 @@ - + {E2E59C98-998F-9965-991D-99411166AF6F} false true $(RepoRoot)\external\test-runtime\XUnit.Runtime.depproj - netstandard-Debug;netstandard-Release + netcoreapp-Debug;netcoreapp-Release;netstandard-Debug;netstandard-Release + + + + + + + \ No newline at end of file diff --git a/src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.netcoreapp.cs b/src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.netcoreapp.cs new file mode 100644 index 000000000000..ed68ba164397 --- /dev/null +++ b/src/CoreFx.Private.TestUtilities/ref/CoreFx.Private.TestUtilities.netcoreapp.cs @@ -0,0 +1,30 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// ------------------------------------------------------------------------------ +// Changes to this file must follow the http://aka.ms/api-review process. 
+// ------------------------------------------------------------------------------
+
+namespace System.Buffers
+{
+    public static partial class BoundedMemory
+    {
+        public static System.Buffers.BoundedMemory<T> Allocate<T>(int elementCount, System.Buffers.PoisonPagePlacement placement = System.Buffers.PoisonPagePlacement.After) where T : unmanaged { throw null; }
+        public static System.Buffers.BoundedMemory<T> AllocateFromExistingData<T>(System.ReadOnlySpan<T> data, System.Buffers.PoisonPagePlacement placement = System.Buffers.PoisonPagePlacement.After) where T : unmanaged { throw null; }
+        public static System.Buffers.BoundedMemory<T> AllocateFromExistingData<T>(T[] data, System.Buffers.PoisonPagePlacement placement = System.Buffers.PoisonPagePlacement.After) where T : unmanaged { throw null; }
+    }
+    public abstract partial class BoundedMemory<T> : IDisposable where T : unmanaged
+    {
+        public abstract bool IsReadonly { get; }
+        public abstract System.Memory<T> Memory { get; }
+        public abstract System.Span<T> Span { get; }
+        public abstract void Dispose();
+        public abstract void MakeReadonly();
+        public abstract void MakeWriteable();
+    }
+    public enum PoisonPagePlacement
+    {
+        After = 0,
+        Before = 1,
+    }
+}
diff --git a/src/CoreFx.Private.TestUtilities/src/CoreFx.Private.TestUtilities.csproj b/src/CoreFx.Private.TestUtilities/src/CoreFx.Private.TestUtilities.csproj
index 262fb54bd404..58b301b1e8d1 100644
--- a/src/CoreFx.Private.TestUtilities/src/CoreFx.Private.TestUtilities.csproj
+++ b/src/CoreFx.Private.TestUtilities/src/CoreFx.Private.TestUtilities.csproj
@@ -12,6 +12,13 @@ Test Utilities are not supported on this platform
netcoreapp-Unix-Debug;netcoreapp-Unix-Release;netcoreapp-Windows_NT-Debug;netcoreapp-Windows_NT-Release;netcoreapp2.0-Unix-Debug;netcoreapp2.0-Unix-Release;netcoreapp2.0-Windows_NT-Debug;netcoreapp2.0-Windows_NT-Release;netcoreappaot-Windows_NT-Debug;netcoreappaot-Windows_NT-Release;netfx-Windows_NT-Debug;netfx-Windows_NT-Release;netstandard-Debug;netstandard-Release;uap-Windows_NT-Debug;uap-Windows_NT-Release;uapaot-Windows_NT-Debug;uapaot-Windows_NT-Release
+
+
+
+
+
+
+
diff --git a/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Creation.cs b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Creation.cs
new file mode 100644
index 000000000000..b77cd3c24687
--- /dev/null
+++ b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Creation.cs
@@ -0,0 +1,67 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.InteropServices;
+
+namespace System.Buffers
+{
+    /// <summary>
+    /// Contains factory methods to create <see cref="BoundedMemory{T}"/> instances.
+    /// </summary>
+    public static partial class BoundedMemory
+    {
+        /// <summary>
+        /// Allocates a new <see cref="BoundedMemory{T}"/> region which is immediately preceded by
+        /// or immediately followed by a poison (PAGE_NOACCESS) page. If <paramref name="placement"/>
+        /// is <see cref="PoisonPagePlacement.Before"/>, then attempting to read the memory
+        /// immediately before the returned <see cref="BoundedMemory{T}"/> will result in an AV.
+        /// If <paramref name="placement"/> is <see cref="PoisonPagePlacement.After"/>, then
+        /// attempting to read the memory immediately after the returned <see cref="BoundedMemory{T}"/>
+        /// will result in an AV.
+        /// </summary>
+        /// <remarks>
+        /// The newly-allocated memory will be populated with random data.
+        /// </remarks>
+        public static BoundedMemory<T> Allocate<T>(int elementCount, PoisonPagePlacement placement = PoisonPagePlacement.After) where T : unmanaged
+        {
+            if (elementCount < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(elementCount));
+            }
+            if (placement != PoisonPagePlacement.Before && placement != PoisonPagePlacement.After)
+            {
+                throw new ArgumentOutOfRangeException(nameof(placement));
+            }
+
+            var retVal = AllocateWithoutDataPopulation<T>(elementCount, placement);
+            new Random().NextBytes(MemoryMarshal.AsBytes(retVal.Span)); // doesn't need to be cryptographically strong
+            return retVal;
+        }
+
+        /// <summary>
+        /// Similar to <see cref="Allocate{T}(int, PoisonPagePlacement)"/>, but populates the allocated
+        /// native memory block from existing data rather than using random data.
+        /// </summary>
+        public static BoundedMemory<T> AllocateFromExistingData<T>(ReadOnlySpan<T> data, PoisonPagePlacement placement = PoisonPagePlacement.After) where T : unmanaged
+        {
+            if (placement != PoisonPagePlacement.Before && placement != PoisonPagePlacement.After)
+            {
+                throw new ArgumentOutOfRangeException(nameof(placement));
+            }
+
+            var retVal = AllocateWithoutDataPopulation<T>(data.Length, placement);
+            data.CopyTo(retVal.Span);
+            return retVal;
+        }
+
+        /// <summary>
+        /// Similar to <see cref="Allocate{T}(int, PoisonPagePlacement)"/>, but populates the allocated
+        /// native memory block from existing data rather than using random data.
+        /// </summary>
+        public static BoundedMemory<T> AllocateFromExistingData<T>(T[] data, PoisonPagePlacement placement = PoisonPagePlacement.After) where T : unmanaged
+        {
+            return AllocateFromExistingData(new ReadOnlySpan<T>(data), placement);
+        }
+    }
+}
diff --git a/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Unix.cs b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Unix.cs
new file mode 100644
index 000000000000..aa9d87a397b2
--- /dev/null
+++ b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Unix.cs
@@ -0,0 +1,47 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System.Buffers
+{
+    public static partial class BoundedMemory
+    {
+        private static UnixImplementation<T> AllocateWithoutDataPopulation<T>(int elementCount, PoisonPagePlacement placement) where T : unmanaged
+        {
+            // On non-Windows platforms, we don't yet have support for changing the permissions of individual pages, so 'placement' is ignored.
+
+            return new UnixImplementation<T>(elementCount);
+        }
+
+        private sealed class UnixImplementation<T> : BoundedMemory<T> where T : unmanaged
+        {
+            private readonly T[] _buffer;
+
+            public UnixImplementation(int elementCount)
+            {
+                _buffer = new T[elementCount];
+            }
+
+            public override bool IsReadonly => false;
+
+            public override Memory<T> Memory => _buffer;
+
+            public override Span<T> Span => _buffer;
+
+            public override void Dispose()
+            {
+                // no-op: the backing store is a managed array, nothing to release
+            }
+
+            public override void MakeReadonly()
+            {
+                // no-op: page protection is not changed on this platform
+            }
+
+            public override void MakeWriteable()
+            {
+                // no-op: the buffer is always writeable on this platform
+            }
+        }
+    }
+}
diff --git a/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Windows.cs b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Windows.cs
new file mode 100644
index 000000000000..d60df689a5e5
--- /dev/null
+++ b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.Windows.cs
@@ -0,0 +1,333 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Runtime.ConstrainedExecution;
+using System.Runtime.InteropServices;
+using System.Security;
+
+namespace System.Buffers
+{
+    public static unsafe partial class BoundedMemory
+    {
+        private static readonly int SystemPageSize = Environment.SystemPageSize;
+
+        private static WindowsImplementation<T> AllocateWithoutDataPopulation<T>(int elementCount, PoisonPagePlacement placement) where T : unmanaged
+        {
+            long cb, totalBytesToAllocate;
+            checked
+            {
+                cb = elementCount * sizeof(T); // int * int multiply; inside 'checked' an overflow throws rather than wraps
+                totalBytesToAllocate = cb;
+
+                // We only need to round the count up if it's not an exact multiple
+                // of the system page size.
+
+                var leftoverBytes = totalBytesToAllocate % SystemPageSize;
+                if (leftoverBytes != 0)
+                {
+                    totalBytesToAllocate += SystemPageSize - leftoverBytes;
+                }
+
+                // Finally, account for the poison pages at the front and back.
+
+                totalBytesToAllocate += 2 * SystemPageSize;
+            }
+
+            // Reserve and commit the entire range as NOACCESS.
+
+            var handle = UnsafeNativeMethods.VirtualAlloc(
+                lpAddress: IntPtr.Zero,
+                dwSize: (IntPtr)totalBytesToAllocate /* cast throws OverflowException if out of range */,
+                flAllocationType: VirtualAllocAllocationType.MEM_RESERVE | VirtualAllocAllocationType.MEM_COMMIT,
+                flProtect: VirtualAllocProtection.PAGE_NOACCESS);
+
+            if (handle == null || handle.IsInvalid)
+            {
+                Marshal.ThrowExceptionForHR(Marshal.GetHRForLastWin32Error());
+                throw new InvalidOperationException("VirtualAlloc failed unexpectedly.");
+            }
+
+            // Done allocating! Now carve out a READWRITE section bookended by the NOACCESS
+            // pages and return that carved-out section to the caller. Since memory protection
+            // flags only apply at page-level granularity, we need to "left-align" or "right-
+            // align" the section we carve out so that it's guaranteed adjacent to one of
+            // the NOACCESS bookend pages.
+
+            return new WindowsImplementation<T>(
+                handle: handle,
+                byteOffsetIntoHandle: (placement == PoisonPagePlacement.Before)
+                    ? SystemPageSize /* just after leading poison page */
+                    : checked((int)(totalBytesToAllocate - SystemPageSize - cb)) /* just before trailing poison page */,
+                elementCount: elementCount)
+            {
+                Protection = VirtualAllocProtection.PAGE_READWRITE
+            };
+        }
+
+        private sealed class WindowsImplementation<T> : BoundedMemory<T> where T : unmanaged
+        {
+            private readonly VirtualAllocHandle _handle;
+            private readonly int _byteOffsetIntoHandle;
+            private readonly int _elementCount;
+            private readonly BoundedMemoryManager _memoryManager;
+
+            internal WindowsImplementation(VirtualAllocHandle handle, int byteOffsetIntoHandle, int elementCount)
+            {
+                _handle = handle;
+                _byteOffsetIntoHandle = byteOffsetIntoHandle;
+                _elementCount = elementCount;
+                _memoryManager = new BoundedMemoryManager(this);
+            }
+
+            public override bool IsReadonly => (Protection != VirtualAllocProtection.PAGE_READWRITE);
+
+            internal VirtualAllocProtection Protection
+            {
+                get
+                {
+                    bool refAdded = false;
+                    try
+                    {
+                        _handle.DangerousAddRef(ref refAdded);
+                        if (UnsafeNativeMethods.VirtualQuery(
+                            lpAddress: _handle.DangerousGetHandle() + _byteOffsetIntoHandle,
+                            lpBuffer: out var memoryInfo,
+                            dwLength: (IntPtr)sizeof(MEMORY_BASIC_INFORMATION)) == IntPtr.Zero)
+                        {
+                            Marshal.ThrowExceptionForHR(Marshal.GetHRForLastWin32Error());
+                            throw new InvalidOperationException("VirtualQuery failed unexpectedly.");
+                        }
+                        return memoryInfo.Protect;
+                    }
+                    finally
+                    {
+                        if (refAdded)
+                        {
+                            _handle.DangerousRelease();
+                        }
+                    }
+                }
+                set
+                {
+                    if (_elementCount > 0) // zero-length regions are left untouched
+                    {
+                        bool refAdded = false;
+                        try
+                        {
+                            _handle.DangerousAddRef(ref refAdded);
+                            if (!UnsafeNativeMethods.VirtualProtect(
+                                lpAddress: _handle.DangerousGetHandle() + _byteOffsetIntoHandle,
+                                dwSize: (IntPtr)(&((T*)null)[_elementCount]), // address of element [_elementCount] off a null base == byte length
+                                flNewProtect: value,
+                                lpflOldProtect: out _))
+                            {
+                                Marshal.ThrowExceptionForHR(Marshal.GetHRForLastWin32Error());
+                                throw new InvalidOperationException("VirtualProtect failed unexpectedly.");
+                            }
+                        }
+                        finally
+                        {
+                            if (refAdded)
+                            {
+                                _handle.DangerousRelease();
+                            }
+                        }
+                    }
+                }
+            }
+
+            public override Memory<T> Memory => _memoryManager.Memory;
+
+            public override Span<T> Span
+            {
+                get
+                {
+                    bool refAdded = false;
+                    try
+                    {
+                        _handle.DangerousAddRef(ref refAdded);
+                        return new Span<T>((void*)(_handle.DangerousGetHandle() + _byteOffsetIntoHandle), _elementCount);
+                    }
+                    finally
+                    {
+                        if (refAdded)
+                        {
+                            _handle.DangerousRelease();
+                        }
+                    }
+                }
+            }
+
+            public override void Dispose()
+            {
+                _handle.Dispose();
+            }
+
+            public override void MakeReadonly()
+            {
+                Protection = VirtualAllocProtection.PAGE_READONLY;
+            }
+
+            public override void MakeWriteable()
+            {
+                Protection = VirtualAllocProtection.PAGE_READWRITE;
+            }
+
+            private sealed class BoundedMemoryManager : MemoryManager<T>
+            {
+                private readonly WindowsImplementation<T> _impl;
+
+                public BoundedMemoryManager(WindowsImplementation<T> impl)
+                {
+                    _impl = impl;
+                }
+
+                public override Memory<T> Memory => CreateMemory(_impl._elementCount);
+
+                protected override void Dispose(bool disposing)
+                {
+                    // no-op; the handle will be disposed separately
+                }
+
+                public override Span<T> GetSpan()
+                {
+                    return _impl.Span; // Memory<T>.Span is served by GetSpan(), so it must hand back the real span instead of throwing
+                }
+
+                public override MemoryHandle Pin(int elementIndex)
+                {
+                    if ((uint)elementIndex > (uint)_impl._elementCount) // elementIndex == _elementCount passes this check
+                    {
+                        throw new ArgumentOutOfRangeException(paramName: nameof(elementIndex));
+                    }
+
+                    bool refAdded = false;
+                    try
+                    {
+                        _impl._handle.DangerousAddRef(ref refAdded);
+                        return new MemoryHandle((T*)(_impl._handle.DangerousGetHandle() + _impl._byteOffsetIntoHandle) + elementIndex);
+                    }
+                    finally
+                    {
+                        if (refAdded)
+                        {
+                            _impl._handle.DangerousRelease();
+                        }
+                    }
+                }
+
+                public override void Unpin()
+                {
+                    // no-op - we don't unpin native memory
+                }
+            }
+        }
+
+        // from winnt.h
+        [Flags]
+        private enum VirtualAllocAllocationType : uint
+        {
+            MEM_COMMIT = 0x1000,
+            MEM_RESERVE = 0x2000,
+            MEM_DECOMMIT = 0x4000,
+            MEM_RELEASE = 0x8000,
+            MEM_FREE = 0x10000,
+            MEM_PRIVATE = 0x20000,
+            MEM_MAPPED = 0x40000,
+            MEM_RESET = 0x80000,
+            MEM_TOP_DOWN = 0x100000,
+            MEM_WRITE_WATCH = 0x200000,
+            MEM_PHYSICAL = 0x400000,
+            MEM_ROTATE = 0x800000,
+            MEM_LARGE_PAGES = 0x20000000,
+            MEM_4MB_PAGES = 0x80000000,
+        }
+
+        // from winnt.h
+        [Flags]
+        private enum VirtualAllocProtection : uint
+        {
+            PAGE_NOACCESS = 0x01,
+            PAGE_READONLY = 0x02,
+            PAGE_READWRITE = 0x04,
+            PAGE_WRITECOPY = 0x08,
+            PAGE_EXECUTE = 0x10,
+            PAGE_EXECUTE_READ = 0x20,
+            PAGE_EXECUTE_READWRITE = 0x40,
+            PAGE_EXECUTE_WRITECOPY = 0x80,
+            PAGE_GUARD = 0x100,
+            PAGE_NOCACHE = 0x200,
+            PAGE_WRITECOMBINE = 0x400,
+        }
+
+        [StructLayout(LayoutKind.Sequential)]
+        private struct MEMORY_BASIC_INFORMATION
+        {
+            public IntPtr BaseAddress;
+            public IntPtr AllocationBase;
+            public VirtualAllocProtection AllocationProtect;
+            public IntPtr RegionSize;
+            public VirtualAllocAllocationType State;
+            public VirtualAllocProtection Protect;
+            public VirtualAllocAllocationType Type;
+        };
+
+        private sealed class VirtualAllocHandle : SafeHandle
+        {
+            // Called by P/Invoke when returning SafeHandles
+            private VirtualAllocHandle()
+                : base(IntPtr.Zero, ownsHandle: true)
+            {
+            }
+
+            // Do not provide a finalizer - SafeHandle's critical finalizer will
+            // call ReleaseHandle for you.
+
+            public override bool IsInvalid => (handle == IntPtr.Zero);
+
+            protected override bool ReleaseHandle() =>
+                UnsafeNativeMethods.VirtualFree(handle, IntPtr.Zero, VirtualAllocAllocationType.MEM_RELEASE);
+        }
+
+        [SuppressUnmanagedCodeSecurity]
+        private static class UnsafeNativeMethods
+        {
+            private const string KERNEL32_LIB = "kernel32.dll";
+
+            // https://msdn.microsoft.com/en-us/library/windows/desktop/aa366887(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            public static extern VirtualAllocHandle VirtualAlloc(
+                [In] IntPtr lpAddress,
+                [In] IntPtr dwSize,
+                [In] VirtualAllocAllocationType flAllocationType,
+                [In] VirtualAllocProtection flProtect);
+
+            // https://msdn.microsoft.com/en-us/library/windows/desktop/aa366892(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
+            [return: MarshalAs(UnmanagedType.Bool)]
+            public static extern bool VirtualFree(
+                [In] IntPtr lpAddress,
+                [In] IntPtr dwSize,
+                [In] VirtualAllocAllocationType dwFreeType);
+
+            // https://msdn.microsoft.com/en-us/library/windows/desktop/aa366898(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            [return: MarshalAs(UnmanagedType.Bool)]
+            public static extern bool VirtualProtect(
+                [In] IntPtr lpAddress,
+                [In] IntPtr dwSize,
+                [In] VirtualAllocProtection flNewProtect,
+                [Out] out VirtualAllocProtection lpflOldProtect);
+
+            // https://msdn.microsoft.com/en-us/library/windows/desktop/aa366902(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            public static extern IntPtr VirtualQuery(
+                [In] IntPtr lpAddress,
+                [Out] out MEMORY_BASIC_INFORMATION lpBuffer,
+                [In] IntPtr dwLength);
+        }
+    }
+}
diff --git a/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.cs
b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.cs
new file mode 100644
index 000000000000..cc39ff3cd804
--- /dev/null
+++ b/src/CoreFx.Private.TestUtilities/src/System/Buffers/BoundedMemory.cs
@@ -0,0 +1,49 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System.Buffers
+{
+    /// <summary>
+    /// Represents a region of native memory. The <see cref="Memory"/> property can be used
+    /// to get a <see cref="Memory{T}"/> backed by this memory region.
+    /// </summary>
+    public abstract class BoundedMemory<T> : IDisposable where T : unmanaged
+    {
+        /// <summary>
+        /// Returns a value stating whether this native memory block is readonly.
+        /// </summary>
+        public abstract bool IsReadonly { get; }
+
+        /// <summary>
+        /// Gets the <see cref="Memory{T}"/> which represents this native memory.
+        /// This <see cref="BoundedMemory{T}"/> instance must be kept alive while working with the <see cref="Memory{T}"/>.
+        /// </summary>
+        public abstract Memory<T> Memory { get; }
+
+        /// <summary>
+        /// Gets the <see cref="Span{T}"/> which represents this native memory.
+        /// This <see cref="BoundedMemory{T}"/> instance must be kept alive while working with the <see cref="Span{T}"/>.
+        /// </summary>
+        public abstract Span<T> Span { get; }
+
+        /// <summary>
+        /// Disposes this <see cref="BoundedMemory{T}"/> instance.
+        /// </summary>
+        public abstract void Dispose();
+
+        /// <summary>
+        /// Sets this native memory block to be readonly. Writes to this block will cause an AV.
+        /// This method has no effect if the memory block is zero length or if the underlying
+        /// OS does not support marking the memory block as readonly.
+        /// </summary>
+        public abstract void MakeReadonly();
+
+        /// <summary>
+        /// Sets this native memory block to be read+write.
+        /// This method has no effect if the memory block is zero length or if the underlying
+        /// OS does not support marking the memory block as read+write.
+        /// </summary>
+        public abstract void MakeWriteable();
+    }
+}
\ No newline at end of file
diff --git a/src/CoreFx.Private.TestUtilities/src/System/Buffers/PoisonPagePlacement.cs b/src/CoreFx.Private.TestUtilities/src/System/Buffers/PoisonPagePlacement.cs
new file mode 100644
index 000000000000..ea2caa3136b5
--- /dev/null
+++ b/src/CoreFx.Private.TestUtilities/src/System/Buffers/PoisonPagePlacement.cs
@@ -0,0 +1,26 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System.Buffers
+{
+    /// <summary>
+    /// Dictates where the poison page should be placed.
+    /// </summary>
+    public enum PoisonPagePlacement
+    {
+        /// <summary>
+        /// The poison page should be placed immediately after the memory region.
+        /// Attempting to access the memory page immediately following the
+        /// span will result in an AV.
+        /// </summary>
+        After,
+
+        /// <summary>
+        /// The poison page should be placed immediately before the memory region.
+        /// Attempting to access the memory page immediately before the
+        /// span will result in an AV.
+        /// </summary>
+        Before,
+    }
+}
\ No newline at end of file
diff --git a/src/System.Runtime/ref/System.Runtime.cs b/src/System.Runtime/ref/System.Runtime.cs
index 116238ebc6b4..97ce8721d68b 100644
--- a/src/System.Runtime/ref/System.Runtime.cs
+++ b/src/System.Runtime/ref/System.Runtime.cs
@@ -7852,6 +7852,14 @@ void System.Collections.IEnumerator.Reset() { } void System.IDisposable.Dispose() { } } }
+namespace System.Text.Unicode
+{
+    public static partial class Utf8
+    {
+        public static System.Buffers.OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { throw null; }
+        public static System.Buffers.OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { throw null; }
+    }
+}
 namespace System.Threading { public readonly partial struct CancellationToken
diff --git a/src/System.Runtime/tests/System.Runtime.Tests.csproj b/src/System.Runtime/tests/System.Runtime.Tests.csproj
index 1627c8de70fb..a2f3d2e61383 100644
--- a/src/System.Runtime/tests/System.Runtime.Tests.csproj
+++ b/src/System.Runtime/tests/System.Runtime.Tests.csproj
@@ -285,6 +285,9 @@
+
+
+
diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs
new file mode 100644
index 000000000000..18ceedc2f832
--- /dev/null
+++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs
@@ -0,0 +1,264 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+ +using System.Buffers; +using System.Linq; +using Xunit; + +namespace System.Text.Unicode.Tests +{ + public partial class Utf8Tests + { + [Theory] + [InlineData("", "")] // empty string is OK + [InlineData(X_UTF16, X_UTF8)] + [InlineData(E_ACUTE_UTF16, E_ACUTE_UTF8)] + [InlineData(EURO_SYMBOL_UTF16, EURO_SYMBOL_UTF8)] + public void ToBytes_WithSmallValidBuffers(string utf16Input, string expectedUtf8TranscodingHex) + { + // These test cases are for the "slow processing" code path at the end of TranscodeToUtf8, + // so inputs should be less than 2 chars. + + Assert.InRange(utf16Input.Length, 0, 1); + + ToBytes_Test_Core( + utf16Input: utf16Input, + destinationSize: expectedUtf8TranscodingHex.Length / 2, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.Done, + expectedNumCharsRead: utf16Input.Length, + expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex)); + } + + [Theory] + [InlineData("AB")] // 2 ASCII chars, hits fast inner loop + [InlineData("ABCD")] // 4 ASCII chars, hits fast inner loop + [InlineData("ABCDEF")] // 6 ASCII chars, hits fast inner loop + [InlineData("ABCDEFGH")] // 8 ASCII chars, hits fast inner loop + [InlineData("ABCDEFGHIJ")] // 10 ASCII chars, hits fast inner loop + [InlineData("ABCDEF" + E_ACUTE_UTF16 + "HIJ")] // interrupts inner loop due to non-ASCII char in first char of first DWORD + [InlineData("ABCDEFG" + EURO_SYMBOL_UTF16 + "IJ")] // interrupts inner loop due to non-ASCII char in second char of first DWORD + [InlineData("ABCDEFGH" + E_ACUTE_UTF16 + "J")] // interrupts inner loop due to non-ASCII char in first char of second DWORD + [InlineData("ABCDEFGHI" + EURO_SYMBOL_UTF16)] // interrupts inner loop due to non-ASCII char in second char of second DWORD + [InlineData(X_UTF16 + E_ACUTE_UTF16)] // drains first ASCII char then falls down to slow path + [InlineData(X_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // drains first ASCII char then consumes 2x 2-byte sequences at once + 
[InlineData(E_ACUTE_UTF16 + X_UTF16)] // no first ASCII char to drain, consumes 2-byte seq followed by ASCII char + [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // stay within 2x 2-byte sequence processing loop + [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + X_UTF16)] // break out of 2x 2-byte seq loop due to ASCII data in second char of DWORD + [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + X_UTF16 + X_UTF16)] // break out of 2x 2-byte seq loop due to ASCII data in first char of DWORD + [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + EURO_SYMBOL_UTF16)] // break out of 2x 2-byte seq loop due to 3-byte data + [InlineData(E_ACUTE_UTF16 + EURO_SYMBOL_UTF16)] // 2-byte logic sees next char isn't ASCII, cannot read full DWORD from remaining input buffer, falls down to slow drain loop + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + X_UTF16)] // 2x 3-byte logic can't read a full DWORD from next part of buffer, falls down to slow drain loop + [InlineData(EURO_SYMBOL_UTF16 + X_UTF16)] // 3-byte processing loop consumes trailing ASCII char, but can't read next DWORD, falls down to slow drain loop + [InlineData(EURO_SYMBOL_UTF16 + X_UTF16 + X_UTF16)] // 3-byte processing loop consumes trailing ASCII char, but can't read next DWORD, falls down to slow drain loop + [InlineData(EURO_SYMBOL_UTF16 + E_ACUTE_UTF16)] // 3-byte processing loop can't consume next ASCII char, can't read DWORD, falls down to slow drain loop + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // stay within 2x 3-byte sequence processing loop + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + X_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // consume stray ASCII char at beginning of DWORD after 2x 3-byte sequence + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + X_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // consume stray ASCII char at end of DWORD 
after 2x 3-byte sequence + [InlineData(EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + X_UTF16)] // consume 2-byte sequence as second char in DWORD which begins with 3-byte encoded char + [InlineData(EURO_SYMBOL_UTF16 + GRINNING_FACE_UTF16)] // 3-byte sequence followed by 4-byte sequence + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + GRINNING_FACE_UTF16)] // 2x 3-byte sequence followed by 4-byte sequence + [InlineData(GRINNING_FACE_UTF16)] // single 4-byte surrogate char pair + [InlineData(GRINNING_FACE_UTF16 + EURO_SYMBOL_UTF16)] // 4-byte surrogate char pair, cannot read next DWORD, falls down to slow drain loop + public void ToBytes_WithLargeValidBuffers(string utf16Input) + { + // These test cases are for the "fast processing" code which is the main loop of TranscodeToUtf8, + // so inputs should be at least 2 chars. + + Assert.True(utf16Input.Length >= 2); + + // We're going to run the tests with destination buffer lengths ranging from 0 all the way + // to buffers large enough to hold the full output. This allows us to test logic that + // detects whether we're about to overrun our destination buffer and instead returns DestinationTooSmall. 
+
+            Rune[] enumeratedScalars = utf16Input.EnumerateRunes().ToArray();
+
+            // 0-length buffer test
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: 0,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.DestinationTooSmall,
+                expectedNumCharsRead: 0,
+                expectedUtf8Transcoding: ReadOnlySpan<byte>.Empty);
+
+            int expectedNumCharsConsumed = 0;
+            byte[] concatenatedUtf8 = Array.Empty<byte>();
+
+            for (int i = 0; i < enumeratedScalars.Length; i++)
+            {
+                Rune thisScalar = enumeratedScalars[i];
+
+                // provide partial destination buffers all the way up to (but not including) enough to hold the next full scalar encoding
+                for (int j = 1; j < thisScalar.Utf8SequenceLength; j++)
+                {
+                    ToBytes_Test_Core(
+                        utf16Input: utf16Input,
+                        destinationSize: concatenatedUtf8.Length + j,
+                        replaceInvalidSequences: false,
+                        isFinalChunk: false,
+                        expectedOperationStatus: OperationStatus.DestinationTooSmall,
+                        expectedNumCharsRead: expectedNumCharsConsumed,
+                        expectedUtf8Transcoding: concatenatedUtf8);
+                }
+
+                // now provide a destination buffer large enough to hold the next full scalar encoding
+
+                expectedNumCharsConsumed += thisScalar.Utf16SequenceLength;
+                concatenatedUtf8 = concatenatedUtf8.Concat(ToUtf8(thisScalar)).ToArray();
+
+                ToBytes_Test_Core(
+                    utf16Input: utf16Input,
+                    destinationSize: concatenatedUtf8.Length,
+                    replaceInvalidSequences: false,
+                    isFinalChunk: false,
+                    expectedOperationStatus: (i == enumeratedScalars.Length - 1) ? OperationStatus.Done : OperationStatus.DestinationTooSmall,
+                    expectedNumCharsRead: expectedNumCharsConsumed,
+                    expectedUtf8Transcoding: concatenatedUtf8);
+            }
+        }
+
+        [Theory]
+        [InlineData('\uD800', OperationStatus.NeedMoreData)] // standalone high surrogate
+        [InlineData('\uDFFF', OperationStatus.InvalidData)] // standalone low surrogate
+        public void ToBytes_WithOnlyStandaloneSurrogates(char charValue, OperationStatus expectedOperationStatus)
+        {
+            ToBytes_Test_Core(
+                utf16Input: new[] { charValue },
+                destinationSize: 0,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: expectedOperationStatus,
+                expectedNumCharsRead: 0,
+                expectedUtf8Transcoding: Span<byte>.Empty);
+        }
+
+        [Theory]
+        [InlineData("<LOW><HIGH>", 0, "")] // swapped surrogate pair characters
+        [InlineData("A<LOW><HIGH>", 1, "41")] // consume standalone ASCII char, then swapped surrogate pair characters
+        [InlineData("A<HIGH>B", 1, "41")] // consume standalone ASCII char, then standalone high surrogate char
+        [InlineData("A<LOW>B", 1, "41")] // consume standalone ASCII char, then standalone low surrogate char
+        [InlineData("AB<HIGH><HIGH>", 2, "4142")] // consume two ASCII chars, then standalone high surrogate char
+        [InlineData("AB<LOW><LOW>", 2, "4142")] // consume two ASCII chars, then standalone low surrogate char
+        public void ToBytes_WithInvalidSurrogates(string utf16Input, int expectedNumCharsConsumed, string expectedUtf8TranscodingHex)
+        {
+            // xUnit can't handle ill-formed strings in [InlineData], so we replace here.
+
+            utf16Input = utf16Input.Replace("<HIGH>", "\uD800").Replace("<LOW>", "\uDFFF");
+
+            // These test cases are for the "fast processing" code which is the main loop of TranscodeToUtf8,
+            // so inputs should be at least 2 chars.
+
+            Assert.True(utf16Input.Length >= 2);
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.InvalidData,
+                expectedNumCharsRead: expectedNumCharsConsumed,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+        }
+
+        [Theory]
+        [InlineData("<LOW><HIGH>", REPLACEMENT_CHAR_UTF8)] // standalone low surr. and incomplete high surr.
+        [InlineData("<HIGH><HIGH>", REPLACEMENT_CHAR_UTF8)] // standalone high surr. and incomplete high surr.
+        [InlineData("<LOW><LOW>", REPLACEMENT_CHAR_UTF8 + REPLACEMENT_CHAR_UTF8)] // standalone low surr. and incomplete low surr.
+        [InlineData("A<LOW>B<LOW>C<HIGH>D", "41" + REPLACEMENT_CHAR_UTF8 + "42" + REPLACEMENT_CHAR_UTF8 + "43" + REPLACEMENT_CHAR_UTF8 + "44")] // standalone low, low, high surrounded by other data
+        public void ToBytes_WithReplacements(string utf16Input, string expectedUtf8TranscodingHex)
+        {
+            // xUnit can't handle ill-formed strings in [InlineData], so we replace here.
+
+            utf16Input = utf16Input.Replace("<HIGH>", "\uD800").Replace("<LOW>", "\uDFFF");
+
+            bool isFinalCharHighSurrogate = char.IsHighSurrogate(utf16Input.Last());
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2,
+                replaceInvalidSequences: true,
+                isFinalChunk: false,
+                expectedOperationStatus: (isFinalCharHighSurrogate) ? OperationStatus.NeedMoreData : OperationStatus.Done,
+                expectedNumCharsRead: (isFinalCharHighSurrogate) ?
(utf16Input.Length - 1) : utf16Input.Length, + expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex)); + + if (isFinalCharHighSurrogate) + { + // Also test with isFinalChunk = true + ToBytes_Test_Core( + utf16Input: utf16Input, + destinationSize: expectedUtf8TranscodingHex.Length / 2 + Rune.ReplacementChar.Utf8SequenceLength /* for replacement char */, + replaceInvalidSequences: true, + isFinalChunk: true, + expectedOperationStatus: OperationStatus.Done, + expectedNumCharsRead: utf16Input.Length, + expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex + REPLACEMENT_CHAR_UTF8)); + } + } + + [Theory] + [InlineData(E_ACUTE_UTF16 + "", true, 1, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8)] // not enough output buffer to hold U+FFFD + [InlineData(E_ACUTE_UTF16 + "", true, 2, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // replace standalone low surr. at end + [InlineData(E_ACUTE_UTF16 + "", true, 1, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8)] // not enough output buffer to hold U+FFFD + [InlineData(E_ACUTE_UTF16 + "", true, 2, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // replace standalone high surr. at end + [InlineData(E_ACUTE_UTF16 + "", false, 1, OperationStatus.NeedMoreData, E_ACUTE_UTF8)] // don't replace standalone high surr. 
at end + [InlineData(E_ACUTE_UTF16 + "" + X_UTF16, true, 2, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // not enough output buffer to hold 'X' + [InlineData(E_ACUTE_UTF16 + "" + X_UTF16, false, 2, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // not enough output buffer to hold 'X' + [InlineData(E_ACUTE_UTF16 + "" + X_UTF16, true, 3, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8 + X_UTF8)] // replacement followed by 'X' + [InlineData(E_ACUTE_UTF16 + "" + X_UTF16, false, 3, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8 + X_UTF8)] // replacement followed by 'X' + public void ToBytes_WithReplacements_AndCustomBufferSizes(string utf16Input, bool isFinalChunk, int expectedNumCharsConsumed, OperationStatus expectedOperationStatus, string expectedUtf8TranscodingHex) + { + // xUnit can't handle ill-formed strings in [InlineData], so we replace here. + + utf16Input = utf16Input.Replace("", "\uD800").Replace("", "\uDFFF"); + + ToBytes_Test_Core( + utf16Input: utf16Input, + destinationSize: expectedUtf8TranscodingHex.Length / 2, + replaceInvalidSequences: true, + isFinalChunk: isFinalChunk, + expectedOperationStatus: expectedOperationStatus, + expectedNumCharsRead: expectedNumCharsConsumed, + expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex)); + } + + [Fact] + public void ToBytes_AllPossibleScalarValues() + { + ToBytes_Test_Core( + utf16Input: s_allScalarsAsUtf16.Span, + destinationSize: s_allScalarsAsUtf8.Length, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.Done, + expectedNumCharsRead: s_allScalarsAsUtf16.Length, + expectedUtf8Transcoding: s_allScalarsAsUtf8.Span); + } + + private static void ToBytes_Test_Core(ReadOnlySpan utf16Input, int destinationSize, bool replaceInvalidSequences, bool isFinalChunk, OperationStatus expectedOperationStatus, int expectedNumCharsRead, ReadOnlySpan expectedUtf8Transcoding) + { + // 
Arrange + + using (BoundedMemory boundedSource = BoundedMemory.AllocateFromExistingData(utf16Input)) + using (BoundedMemory boundedDestination = BoundedMemory.Allocate(destinationSize)) + { + boundedSource.MakeReadonly(); + + // Act + + OperationStatus actualOperationStatus = Utf8.FromUtf16(boundedSource.Span, boundedDestination.Span, out int actualNumCharsRead, out int actualNumBytesWritten, replaceInvalidSequences, isFinalChunk); + + // Assert + + Assert.Equal(expectedOperationStatus, actualOperationStatus); + Assert.Equal(expectedNumCharsRead, actualNumCharsRead); + Assert.Equal(expectedUtf8Transcoding.Length, actualNumBytesWritten); + Assert.Equal(expectedUtf8Transcoding.ToArray(), boundedDestination.Span.Slice(0, actualNumBytesWritten).ToArray()); + } + } + } +} diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs new file mode 100644 index 000000000000..6dda95dffc10 --- /dev/null +++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs @@ -0,0 +1,304 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Buffers; +using System.Linq; +using Xunit; + +namespace System.Text.Unicode.Tests +{ + public partial class Utf8Tests + { + [Theory] + [InlineData("80", 0, "")] // sequence cannot begin with continuation character + [InlineData("8182", 0, "")] // sequence cannot begin with continuation character + [InlineData("838485", 0, "")] // sequence cannot begin with continuation character + [InlineData(X_UTF8 + "80", 1, X_UTF16)] // sequence cannot begin with continuation character + [InlineData(X_UTF8 + "8182", 1, X_UTF16)] // sequence cannot begin with continuation character + [InlineData("C0", 0, "")] // [ C0 ] is always invalid + [InlineData("C080", 0, "")] // [ C0 ] is always invalid + [InlineData("C08081", 0, "")] // [ C0 ] is always invalid + [InlineData(X_UTF8 + "C1", 1, X_UTF16)] // [ C1 ] is always invalid + [InlineData(X_UTF8 + "C180", 1, X_UTF16)] // [ C1 ] is always invalid + [InlineData(X_UTF8 + "C27F", 1, X_UTF16)] // [ C2 ] is improperly terminated + [InlineData("E2827F", 0, "")] // [ E2 82 ] is improperly terminated + [InlineData("E09F80", 0, "")] // [ E0 9F ... ] is overlong + [InlineData("E0C080", 0, "")] // [ E0 ] is improperly terminated + [InlineData("ED7F80", 0, "")] // [ ED ] is improperly terminated + [InlineData("EDA080", 0, "")] // [ ED A0 ... ] is surrogate + public void ToChars_WithSmallInvalidBuffers(string utf8HexInput, int expectedNumBytesConsumed, string expectedUtf16Transcoding) + { + // These test cases are for the "slow processing" code path at the end of TranscodeToUtf16, + // so inputs should be less than 4 bytes. 
+ + Assert.InRange(utf8HexInput.Length, 0, 6); + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput), + destinationSize: expectedUtf16Transcoding.Length, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.InvalidData, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: expectedUtf16Transcoding); + } + + [Theory] + [InlineData("C2", 0, "")] // [ C2 ] is an incomplete sequence + [InlineData("E282", 0, "")] // [ E2 82 ] is an incomplete sequence + [InlineData(X_UTF8 + "C2", 1, X_UTF16)] // [ C2 ] is an incomplete sequence + [InlineData(X_UTF8 + "E0", 1, X_UTF16)] // [ E0 ] is an incomplete sequence + [InlineData(X_UTF8 + "E0BF", 1, X_UTF16)] // [ E0 BF ] is an incomplete sequence + [InlineData(X_UTF8 + "F0", 1, X_UTF16)] // [ F0 ] is an incomplete sequence + [InlineData(X_UTF8 + "F0BF", 1, X_UTF16)] // [ F0 BF ] is an incomplete sequence + [InlineData(X_UTF8 + "F0BFA0", 1, X_UTF16)] // [ F0 BF A0 ] is an incomplete sequence + [InlineData(E_ACUTE_UTF8 + "C2", 2, E_ACUTE_UTF16)] // [ C2 ] is an incomplete sequence + [InlineData(E_ACUTE_UTF8 + "E0", 2, E_ACUTE_UTF16)] // [ E0 ] is an incomplete sequence + [InlineData(E_ACUTE_UTF8 + "F0", 2, E_ACUTE_UTF16)] // [ F0 ] is an incomplete sequence + [InlineData(E_ACUTE_UTF8 + "E0BF", 2, E_ACUTE_UTF16)] // [ E0 BF ] is an incomplete sequence + [InlineData(E_ACUTE_UTF8 + "F0BF", 2, E_ACUTE_UTF16)] // [ F0 BF ] is an incomplete sequence + [InlineData(EURO_SYMBOL_UTF8 + "C2", 3, EURO_SYMBOL_UTF16)] // [ C2 ] is an incomplete sequence + [InlineData(EURO_SYMBOL_UTF8 + "E0", 3, EURO_SYMBOL_UTF16)] // [ E0 ] is an incomplete sequence + [InlineData(EURO_SYMBOL_UTF8 + "F0", 3, EURO_SYMBOL_UTF16)] // [ F0 ] is an incomplete sequence + public void ToChars_WithVariousIncompleteBuffers(string utf8HexInput, int expectedNumBytesConsumed, string expectedUtf16Transcoding) + { + // These test cases are for the "slow processing" code path at the end of 
TranscodeToUtf16, + // so inputs should be less than 4 bytes. + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput), + destinationSize: expectedUtf16Transcoding.Length, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.NeedMoreData, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: expectedUtf16Transcoding); + } + + [Theory] + /* SMALL VALID BUFFERS - tests drain loop at end of method */ + [InlineData("")] // empty string is OK + [InlineData("X")] + [InlineData("XY")] + [InlineData("XYZ")] + [InlineData(E_ACUTE_UTF16)] + [InlineData(X_UTF16 + E_ACUTE_UTF16)] + [InlineData(E_ACUTE_UTF16 + X_UTF16)] + [InlineData(EURO_SYMBOL_UTF16)] + /* LARGE VALID BUFFERS - test main loop at beginning of method */ + [InlineData(E_ACUTE_UTF16 + "ABCD" + "0123456789:;<=>?")] // Loop unrolling at end of buffer + [InlineData(E_ACUTE_UTF16 + "ABCD" + "0123456789:;<=>?" + "01234567" + E_ACUTE_UTF16 + "89:;<=>?")] // Loop unrolling interrupted by non-ASCII + [InlineData("ABC" + E_ACUTE_UTF16 + "0123")] // 3 ASCII bytes followed by non-ASCII + [InlineData("AB" + E_ACUTE_UTF16 + "0123")] // 2 ASCII bytes followed by non-ASCII + [InlineData("A" + E_ACUTE_UTF16 + "0123")] // 1 ASCII byte followed by non-ASCII + [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 4x 2-byte sequences, exercises optimization code path in 2-byte sequence processing + [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + "PQ")] // 3x 2-byte sequences + 2 ASCII bytes, exercises optimization code path in 2-byte sequence processing + [InlineData(E_ACUTE_UTF16 + "PQ")] // single 2-byte sequence + 2 trailing ASCII bytes, exercises draining logic in 2-byte sequence processing + [InlineData(E_ACUTE_UTF16 + "P" + E_ACUTE_UTF16 + "0@P")] // single 2-byte sequences + 1 trailing ASCII byte + 2-byte sequence, exercises draining logic in 2-byte sequence processing + [InlineData(EURO_SYMBOL_UTF16 + "@")] // 
single 3-byte sequence + 1 trailing ASCII byte, exercises draining logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL_UTF16 + "@P`")] // single 3-byte sequence + 3 trailing ASCII byte, exercises draining logic and "running out of data" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // 3x 3-byte sequences, exercises "stay within 3-byte loop" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // 4x 3-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16)] // 3x 3-byte sequences + single 2-byte sequence, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing + [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing + [InlineData(GRINNING_FACE_UTF16 + GRINNING_FACE_UTF16)] // 2x 4-byte sequences, exercises 4-byte sequence processing + [InlineData(GRINNING_FACE_UTF16 + "@AB")] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic + [InlineData("\U0001F938\U0001F3FD\u200D\u2640\uFE0F")] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths + public void ToChars_ValidBuffers(string utf16Input) + { + // We're going to run the tests with destination buffer lengths ranging from 0 all the way + // to buffers large enough to hold the full output. This allows us to test logic that + // detects whether we're about to overrun our destination buffer and instead returns DestinationTooSmall. 
+ + Rune[] enumeratedScalars = utf16Input.EnumerateRunes().ToArray(); + + // Convert entire input to UTF-8 using our unit test reference logic. + + byte[] utf8Input = enumeratedScalars.SelectMany(ToUtf8).ToArray(); + + // 0-length buffer test + ToChars_Test_Core( + utf8Input: utf8Input, + destinationSize: 0, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: (utf8Input.Length == 0) ? OperationStatus.Done : OperationStatus.DestinationTooSmall, + expectedNumBytesRead: 0, + expectedUtf16Transcoding: ReadOnlySpan.Empty); + + int expectedNumBytesConsumed = 0; + char[] concatenatedUtf16 = Array.Empty(); + + for (int i = 0; i < enumeratedScalars.Length; i++) + { + Rune thisScalar = enumeratedScalars[i]; + + // if this is an astral scalar value, quickly test a buffer that's not large enough to contain the entire UTF-16 encoding + + if (!thisScalar.IsBmp) + { + ToChars_Test_Core( + utf8Input: utf8Input, + destinationSize: concatenatedUtf16.Length + 1, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.DestinationTooSmall, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: concatenatedUtf16); + } + + // now provide a destination buffer large enough to hold the next full scalar encoding + + expectedNumBytesConsumed += thisScalar.Utf8SequenceLength; + concatenatedUtf16 = concatenatedUtf16.Concat(ToUtf16(thisScalar)).ToArray(); + + ToChars_Test_Core( + utf8Input: utf8Input, + destinationSize: concatenatedUtf16.Length, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: (i == enumeratedScalars.Length - 1) ? 
OperationStatus.Done : OperationStatus.DestinationTooSmall, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: concatenatedUtf16); + } + } + + [Theory] + [InlineData("3031" + "80" + "202122232425", 2, "01")] // Continuation character at start of sequence should match no bitmask + [InlineData("3031" + "C080" + "2021222324", 2, "01")] // Overlong 2-byte sequence at start of DWORD + [InlineData("3031" + "C180" + "2021222324", 2, "01")] // Overlong 2-byte sequence at start of DWORD + [InlineData("C280" + "C180", 2, "\u0080")] // Overlong 2-byte sequence at end of DWORD + [InlineData("C27F" + "C280", 0, "")] // Improperly terminated 2-byte sequence at start of DWORD + [InlineData("C2C0" + "C280", 0, "")] // Improperly terminated 2-byte sequence at start of DWORD + [InlineData("C280" + "C27F", 2, "\u0080")] // Improperly terminated 2-byte sequence at end of DWORD + [InlineData("C280" + "C2C0", 2, "\u0080")] // Improperly terminated 2-byte sequence at end of DWORD + [InlineData("C280" + "C280" + "80203040", 4, "\u0080\u0080")] // Continuation character at start of sequence, within "stay in 2-byte processing" optimization + [InlineData("C280" + "C280" + "C180" + "C280", 4, "\u0080\u0080")] // Overlong 2-byte sequence at start of DWORD, within "stay in 2-byte processing" optimization + [InlineData("C280" + "C280" + "C280" + "C180", 6, "\u0080\u0080\u0080")] // Overlong 2-byte sequence at end of DWORD, within "stay in 2-byte processing" optimization + [InlineData("3031" + "E09F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Overlong 3-byte sequence at start of DWORD + [InlineData("3031" + "E07F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "E0C080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "E17F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly 
terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "E1C080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD + [InlineData("3031" + "EDA080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Surrogate 3-byte sequence at start of DWORD + [InlineData("3031" + "F5808080", 2, "01")] // [ F5 ] is always invalid + [InlineData("3031" + "F6808080", 2, "01")] // [ F6 ] is always invalid + [InlineData("3031" + "F7808080", 2, "01")] // [ F7 ] is always invalid + [InlineData("3031" + "F8808080", 2, "01")] // [ F8 ] is always invalid + [InlineData("3031" + "F9808080", 2, "01")] // [ F9 ] is always invalid + [InlineData("3031" + "FA808080", 2, "01")] // [ FA ] is always invalid + [InlineData("3031" + "FB808080", 2, "01")] // [ FB ] is always invalid + [InlineData("3031" + "FC808080", 2, "01")] // [ FC ] is always invalid + [InlineData("3031" + "FD808080", 2, "01")] // [ FD ] is always invalid + [InlineData("3031" + "FE808080", 2, "01")] // [ FE ] is always invalid + [InlineData("3031" + "FF808080", 2, "01")] // [ FF ] is always invalid + public void ToChars_WithLargeInvalidBuffers(string utf8HexInput, int expectedNumBytesConsumed, string expectedUtf16Transcoding) + { + // These test cases are for the "fast processing" code which is the main loop of TranscodeToUtf16, + // so inputs should be >= 4 bytes. 
+ + Assert.True(utf8HexInput.Length >= 8); + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput), + destinationSize: expectedUtf16Transcoding.Length, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.InvalidData, + expectedNumBytesRead: expectedNumBytesConsumed, + expectedUtf16Transcoding: expectedUtf16Transcoding); + } + + [Theory] + [InlineData(X_UTF8 + "80" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // stray continuation byte [ 80 ] + [InlineData(X_UTF8 + "FF" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // invalid UTF-8 byte [ FF ] + [InlineData(X_UTF8 + "C2" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // 2-byte sequence starter [ C2 ] not followed by continuation byte + [InlineData(X_UTF8 + "C1C180" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ C1 80 ] is overlong but consists of two maximal invalid subsequences, each of length 1 byte + [InlineData(X_UTF8 + E_ACUTE_UTF8 + "E08080", X_UTF16 + E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16)] // [ E0 80 ] is overlong 2-byte sequence (1 byte maximal invalid subsequence), and following [ 80 ] is stray continuation byte + [InlineData(GRINNING_FACE_UTF8 + "F08F8080" + GRINNING_FACE_UTF8, GRINNING_FACE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + GRINNING_FACE_UTF16)] // [ F0 8F ] is overlong 4-byte sequence (1 byte maximal invalid subsequence), and following [ 80 ] instances are stray continuation bytes + [InlineData(GRINNING_FACE_UTF8 + "F4908080" + GRINNING_FACE_UTF8, GRINNING_FACE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + GRINNING_FACE_UTF16)] // [ F4 90 ] is out-of-range 4-byte sequence (1 byte maximal invalid subsequence), and following [ 80 ] instances are stray continuation bytes + 
[InlineData(E_ACUTE_UTF8 + "EDA0" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ ED A0 ] is encoding of UTF-16 surrogate code point, so consists of two maximal invalid subsequences, each of length 1 byte + [InlineData(E_ACUTE_UTF8 + "ED80" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ ED 80 ] is incomplete 3-byte sequence, so is 2-byte maximal invalid subsequence + [InlineData(E_ACUTE_UTF8 + "F380" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ F3 80 ] is incomplete 4-byte sequence, so is 2-byte maximal invalid subsequence + [InlineData(E_ACUTE_UTF8 + "F38080" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ F3 80 80 ] is incomplete 4-byte sequence, so is 3-byte maximal invalid subsequence + public void ToChars_WithReplacement(string utf8HexInput, string expectedUtf16Transcoding) + { + // First run the test with isFinalBlock = false, + // both with and without some bytes of incomplete trailing data. + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput), + destinationSize: expectedUtf16Transcoding.Length, + replaceInvalidSequences: true, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.Done, + expectedNumBytesRead: utf8HexInput.Length / 2, + expectedUtf16Transcoding: expectedUtf16Transcoding); + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput + "E0BF" /* trailing data */), + destinationSize: expectedUtf16Transcoding.Length, + replaceInvalidSequences: true, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.NeedMoreData, + expectedNumBytesRead: utf8HexInput.Length / 2, + expectedUtf16Transcoding: expectedUtf16Transcoding); + + // Then run the test with isFinalBlock = true, with incomplete trailing data. 
+ + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput + "E0BF" /* trailing data */), + destinationSize: expectedUtf16Transcoding.Length, + replaceInvalidSequences: true, + isFinalChunk: true, + expectedOperationStatus: OperationStatus.DestinationTooSmall, + expectedNumBytesRead: utf8HexInput.Length / 2, + expectedUtf16Transcoding: expectedUtf16Transcoding); + + ToChars_Test_Core( + utf8Input: DecodeHex(utf8HexInput + "E0BF" /* trailing data */), + destinationSize: expectedUtf16Transcoding.Length + 1, // allow room for U+FFFD + replaceInvalidSequences: true, + isFinalChunk: true, + expectedOperationStatus: OperationStatus.Done, + expectedNumBytesRead: utf8HexInput.Length / 2 + 2, + expectedUtf16Transcoding: expectedUtf16Transcoding + REPLACEMENT_CHAR_UTF16); + } + + [Fact] + public void ToChars_AllPossibleScalarValues() + { + ToChars_Test_Core( + utf8Input: s_allScalarsAsUtf8.Span, + destinationSize: s_allScalarsAsUtf16.Length, + replaceInvalidSequences: false, + isFinalChunk: false, + expectedOperationStatus: OperationStatus.Done, + expectedNumBytesRead: s_allScalarsAsUtf8.Length, + expectedUtf16Transcoding: s_allScalarsAsUtf16.Span); + } + + private static void ToChars_Test_Core(ReadOnlySpan utf8Input, int destinationSize, bool replaceInvalidSequences, bool isFinalChunk, OperationStatus expectedOperationStatus, int expectedNumBytesRead, ReadOnlySpan expectedUtf16Transcoding) + { + // Arrange + + using (BoundedMemory boundedSource = BoundedMemory.AllocateFromExistingData(utf8Input)) + using (BoundedMemory boundedDestination = BoundedMemory.Allocate(destinationSize)) + { + boundedSource.MakeReadonly(); + + // Act + + OperationStatus actualOperationStatus = Utf8.ToUtf16(boundedSource.Span, boundedDestination.Span, out int actualNumBytesRead, out int actualNumCharsWritten, replaceInvalidSequences, isFinalChunk); + + // Assert + + Assert.Equal(expectedOperationStatus, actualOperationStatus); + Assert.Equal(expectedNumBytesRead, actualNumBytesRead); + 
Assert.Equal(expectedUtf16Transcoding.Length, actualNumCharsWritten); + Assert.Equal(expectedUtf16Transcoding.ToString(), boundedDestination.Span.Slice(0, actualNumCharsWritten).ToString()); + } + } + } +} diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs new file mode 100644 index 000000000000..087235a81b74 --- /dev/null +++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs @@ -0,0 +1,141 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Buffers; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text.RegularExpressions; +using Xunit; + +namespace System.Text.Unicode.Tests +{ + public partial class Utf8Tests + { + private const string X_UTF8 = "58"; // U+0058 LATIN CAPITAL LETTER X, 1 byte + private const string X_UTF16 = "X"; + + private const string Y_UTF8 = "59"; // U+0059 LATIN CAPITAL LETTER Y, 1 byte + private const string Y_UTF16 = "Y"; + + private const string Z_UTF8 = "5A"; // U+005A LATIN CAPITAL LETTER Z, 1 byte + private const string Z_UTF16 = "Z"; + + private const string E_ACUTE_UTF8 = "C3A9"; // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 bytes + private const string E_ACUTE_UTF16 = "\u00E9"; + + private const string EURO_SYMBOL_UTF8 = "E282AC"; // U+20AC EURO SIGN, 3 bytes + private const string EURO_SYMBOL_UTF16 = "\u20AC"; + + private const string REPLACEMENT_CHAR_UTF8 = "EFBFBD"; // U+FFFD REPLACEMENT CHAR, 3 bytes + private const string REPLACEMENT_CHAR_UTF16 = "\uFFFD"; + + private const string GRINNING_FACE_UTF8 = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes + private const string GRINNING_FACE_UTF16 = "\U0001F600"; + + // All valid scalars [ U+0000 .. U+D7FF ] and [ U+E000 .. U+10FFFF ]. 
+        private static readonly IEnumerable<Rune> s_allValidScalars = Enumerable.Range(0x0000, 0xD800).Concat(Enumerable.Range(0xE000, 0x110000 - 0xE000)).Select(value => new Rune(value));
+
+        // UTF-16 and UTF-8 encodings of every scalar in s_allValidScalars, concatenated in
+        // ascending code point order. Both buffers are filled once by the static constructor.
+        private static readonly ReadOnlyMemory<char> s_allScalarsAsUtf16;
+        private static readonly ReadOnlyMemory<byte> s_allScalarsAsUtf8;
+
+        // Builds the two "all scalars" buffers with the reference encoders (ToUtf16 / ToUtf8)
+        // rather than the framework transcoder that the tests exercise.
+        static Utf8Tests()
+        {
+            List<char> allScalarsAsUtf16 = new List<char>();
+            List<byte> allScalarsAsUtf8 = new List<byte>();
+
+            // s_allValidScalars is a deferred LINQ query; this loop is where it is enumerated.
+            foreach (Rune rune in s_allValidScalars)
+            {
+                allScalarsAsUtf16.AddRange(ToUtf16(rune));
+                allScalarsAsUtf8.AddRange(ToUtf8(rune));
+            }
+
+            s_allScalarsAsUtf16 = allScalarsAsUtf16.ToArray().AsMemory();
+            s_allScalarsAsUtf8 = allScalarsAsUtf8.ToArray().AsMemory();
+        }
+
+        /*
+         * COMMON UTILITIES FOR UNIT TESTS
+         */
+
+        // Converts a string of hex digit pairs (e.g. "C3A9") into the bytes it spells out.
+        // An empty input yields an empty array. Input that is not made up entirely of
+        // two-digit hex pairs fails the assertion below.
+        private static byte[] DecodeHex(ReadOnlySpan<char> inputHex)
+        {
+            Assert.True(Regex.IsMatch(inputHex.ToString(), "^([0-9a-fA-F]{2})*$"), "Input must be an even number of hex characters.");
+
+            byte[] retVal = new byte[inputHex.Length / 2];
+            for (int i = 0; i < retVal.Length; i++)
+            {
+                // Each output byte comes from one two-character slice of the input.
+                retVal[i] = byte.Parse(inputHex.Slice(i * 2, 2), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+            }
+            return retVal;
+        }
+
+        // !! IMPORTANT !!
+        // Don't delete this implementation, as we use it as a reference to make sure the framework's
+        // transcoding logic is correct.
+        private static byte[] ToUtf8(Rune rune)
+        {
+            // Reference UTF-8 encoder: maps one scalar value to its 1- to 4-byte encoding.
+            int scalar = rune.Value;
+            Assert.True(Rune.IsValid(scalar), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
+
+            // [ 0xxxxxxx ]
+            if (scalar < 0x80)
+            {
+                return new byte[] { (byte)scalar };
+            }
+
+            // Every multi-byte form ends with a continuation byte carrying the low 6 bits.
+            byte lastByte = (byte)((scalar & 0x3F) | 0x80);
+
+            // [ 110xxxxx 10xxxxxx ]
+            if (scalar < 0x0800)
+            {
+                return new byte[] { (byte)((scalar >> 6) | 0xC0), lastByte };
+            }
+
+            // The 3- and 4-byte forms also share the continuation byte carrying bits 6..11.
+            byte secondToLastByte = (byte)(((scalar >> 6) & 0x3F) | 0x80);
+
+            // [ 1110xxxx 10xxxxxx 10xxxxxx ]
+            if (scalar < 0x10000)
+            {
+                return new byte[] { (byte)((scalar >> 12) | 0xE0), secondToLastByte, lastByte };
+            }
+
+            // [ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx ]
+            return new byte[]
+            {
+                (byte)((scalar >> 18) | 0xF0),
+                (byte)(((scalar >> 12) & 0x3F) | 0x80),
+                secondToLastByte,
+                lastByte
+            };
+        }
+
+        // !! IMPORTANT !!
+        // Keep this implementation: it is the reference used to make sure the framework's
+        // transcoding logic is correct.
+        private static char[] ToUtf16(Rune rune)
+        {
+            Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
+
+            // A BMP scalar is a single UTF-16 code unit.
+            if (rune.IsBmp)
+            {
+                return new char[] { (char)rune.Value };
+            }
+
+            // Anything else becomes a surrogate pair. Subtracting 0x40 after the shift is the same
+            // as subtracting 0x10000 from the scalar before taking its upper 10 bits.
+            char highSurrogate = (char)((rune.Value >> 10) + 0xD800 - 0x40);
+            char lowSurrogate = (char)((rune.Value & 0x03FF) + 0xDC00);
+            return new char[] { highSurrogate, lowSurrogate };
+        }
+    }
+}