NOTE(review): this patch was recovered from a whitespace-collapsed copy in which angle-bracketed
text had been stripped. Line breaks and indentation are reconstructed by convention; the generic
type arguments (<char>, <Rune>) and XML doc tags are restored from how the code uses them. The
context lines of the projitems hunk were lost entirely and its two added <Compile> items are
inferred from the paths of the two new files - regenerate that hunk against the real projitems
file before applying.

diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
index 8768d19223d4..f209d954bedb 100644
--- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
+++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems
@@ -652,8 +652,10 @@
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\SpanRuneEnumerator.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\StringRuneEnumerator.cs" />
diff --git a/src/System.Private.CoreLib/shared/System/MemoryExtensions.cs b/src/System.Private.CoreLib/shared/System/MemoryExtensions.cs
index 6521a5af2e31..6145801faff3 100644
--- a/src/System.Private.CoreLib/shared/System/MemoryExtensions.cs
+++ b/src/System.Private.CoreLib/shared/System/MemoryExtensions.cs
@@ -5,6 +5,7 @@
 using System.Collections.Generic;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Text;
 
 using Internal.Runtime.CompilerServices;
 
@@ -975,6 +976,28 @@ ref MemoryMarshal.GetReference(value),
                     valueLength);
         }
 
+        /// <summary>
+        /// Returns an enumeration of <see cref="Rune"/> from the provided span.
+        /// </summary>
+        /// <remarks>
+        /// Invalid sequences will be represented in the enumeration by <see cref="Rune.ReplacementChar"/>.
+        /// </remarks>
+        public static SpanRuneEnumerator EnumerateRunes(this ReadOnlySpan<char> span)
+        {
+            return new SpanRuneEnumerator(span);
+        }
+
+        /// <summary>
+        /// Returns an enumeration of <see cref="Rune"/> from the provided span.
+        /// </summary>
+        /// <remarks>
+        /// Invalid sequences will be represented in the enumeration by <see cref="Rune.ReplacementChar"/>.
+        /// </remarks>
+        public static SpanRuneEnumerator EnumerateRunes(this Span<char> span)
+        {
+            return new SpanRuneEnumerator(span);
+        }
+
         /// <summary>
         /// Reverses the sequence of the elements in the entire span.
         /// </summary>
diff --git a/src/System.Private.CoreLib/shared/System/String.cs b/src/System.Private.CoreLib/shared/System/String.cs
index 7050644d9aac..366b678a22ac 100644
--- a/src/System.Private.CoreLib/shared/System/String.cs
+++ b/src/System.Private.CoreLib/shared/System/String.cs
@@ -532,6 +532,17 @@ IEnumerator IEnumerable.GetEnumerator()
             return new CharEnumerator(this);
         }
 
+        /// <summary>
+        /// Returns an enumeration of <see cref="Rune"/> from this string.
+        /// </summary>
+        /// <remarks>
+        /// Invalid sequences will be represented in the enumeration by <see cref="Rune.ReplacementChar"/>.
+        /// </remarks>
+        public StringRuneEnumerator EnumerateRunes()
+        {
+            return new StringRuneEnumerator(this);
+        }
+
         internal static unsafe int wcslen(char* ptr)
         {
             char* end = ptr;
diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
index a4ef3a37b731..d405b69b2138 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
@@ -249,6 +249,43 @@ public static Rune GetRuneAt(string input, int index)
         [CLSCompliant(false)]
         public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value);
 
+        // returns a negative number on failure
+        internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan<char> input)
+        {
+            if (input.IsEmpty)
+            {
+                return -1;
+            }
+
+            // Optimistically assume input is within BMP.
+
+            uint returnValue = input[0];
+            if (UnicodeUtility.IsSurrogateCodePoint(returnValue))
+            {
+                if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue))
+                {
+                    return -1;
+                }
+
+                // Treat 'returnValue' as the high surrogate.
+
+                if (1 >= (uint)input.Length)
+                {
+                    return -1; // not an argument exception - just a "bad data" failure
+                }
+
+                uint potentialLowSurrogate = input[1];
+                if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate))
+                {
+                    return -1;
+                }
+
+                returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate);
+            }
+
+            return (int)returnValue;
+        }
+
         // returns a negative number on failure
         private static int ReadRuneFromString(string input, int index)
         {
diff --git a/src/System.Private.CoreLib/shared/System/Text/SpanRuneEnumerator.cs b/src/System.Private.CoreLib/shared/System/Text/SpanRuneEnumerator.cs
new file mode 100644
index 000000000000..082a5108c140
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/SpanRuneEnumerator.cs
@@ -0,0 +1,51 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System.Text
+{
+    // An enumerator for retrieving System.Text.Rune instances from a ReadOnlySpan<char>.
+    // Methods are pattern-matched by compiler to allow using foreach pattern.
+    public ref struct SpanRuneEnumerator
+    {
+        private ReadOnlySpan<char> _remaining;
+        private Rune _current;
+
+        internal SpanRuneEnumerator(ReadOnlySpan<char> buffer)
+        {
+            _remaining = buffer;
+            _current = default;
+        }
+
+        public Rune Current => _current;
+
+        public SpanRuneEnumerator GetEnumerator() => this;
+
+        public bool MoveNext()
+        {
+            if (_remaining.IsEmpty)
+            {
+                // reached the end of the buffer
+                _current = default;
+                return false;
+            }
+
+            int scalarValue = Rune.ReadFirstRuneFromUtf16Buffer(_remaining);
+            if (scalarValue < 0)
+            {
+                // replace invalid sequences with U+FFFD
+                scalarValue = Rune.ReplacementChar.Value;
+            }
+
+            // In UTF-16 specifically, invalid sequences always have length 1, which is the same
+            // length as the replacement character U+FFFD. This means that we can always bump the
+            // next index by the current scalar's UTF-16 sequence length. This optimization is not
+            // generally applicable; for example, enumerating scalars from UTF-8 cannot utilize
+            // this same trick.
+
+            _current = Rune.UnsafeCreate((uint)scalarValue);
+            _remaining = _remaining.Slice(_current.Utf16SequenceLength);
+            return true;
+        }
+    }
+}
diff --git a/src/System.Private.CoreLib/shared/System/Text/StringRuneEnumerator.cs b/src/System.Private.CoreLib/shared/System/Text/StringRuneEnumerator.cs
new file mode 100644
index 000000000000..fe12dfa4f74b
--- /dev/null
+++ b/src/System.Private.CoreLib/shared/System/Text/StringRuneEnumerator.cs
@@ -0,0 +1,70 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections;
+using System.Collections.Generic;
+
+namespace System.Text
+{
+    // An enumerator for retrieving System.Text.Rune instances from a System.String.
+    public struct StringRuneEnumerator : IEnumerable<Rune>, IEnumerator<Rune>
+    {
+        private readonly string _string;
+        private Rune _current;
+        private int _nextIndex;
+
+        internal StringRuneEnumerator(string value)
+        {
+            _string = value;
+            _current = default;
+            _nextIndex = 0;
+        }
+
+        public Rune Current => _current;
+
+        public StringRuneEnumerator GetEnumerator() => this;
+
+        public bool MoveNext()
+        {
+            if ((uint)_nextIndex >= _string.Length)
+            {
+                // reached the end of the string
+                _current = default;
+                return false;
+            }
+
+            if (!Rune.TryGetRuneAt(_string, _nextIndex, out _current))
+            {
+                // replace invalid sequences with U+FFFD
+                _current = Rune.ReplacementChar;
+            }
+
+            // In UTF-16 specifically, invalid sequences always have length 1, which is the same
+            // length as the replacement character U+FFFD. This means that we can always bump the
+            // next index by the current scalar's UTF-16 sequence length. This optimization is not
+            // generally applicable; for example, enumerating scalars from UTF-8 cannot utilize
+            // this same trick.
+
+            _nextIndex += _current.Utf16SequenceLength;
+            return true;
+        }
+
+        object IEnumerator.Current => _current;
+
+        void IDisposable.Dispose()
+        {
+            // no-op
+        }
+
+        IEnumerator IEnumerable.GetEnumerator() => this;
+
+        IEnumerator<Rune> IEnumerable<Rune>.GetEnumerator() => this;
+
+        void IEnumerator.Reset()
+        {
+            _current = default;
+            _nextIndex = 0;
+        }
+    }
+}