Skip to content

Commit

Permalink
Add support for surrogate pair capitalization
Browse files Browse the repository at this point in the history
  • Loading branch information
eiriktsarpalis committed Aug 10, 2023
1 parent 16bd58a commit c1c5d1c
Show file tree
Hide file tree
Showing 2 changed files with 177 additions and 23 deletions.
163 changes: 144 additions & 19 deletions src/libraries/System.Text.Json/Common/JsonSeparatorNamingPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ namespace System.Text.Json
internal abstract class JsonSeparatorNamingPolicy : JsonNamingPolicy
{
private readonly bool _lowercase;
private readonly char _separator;
private readonly Rune _separator;

internal JsonSeparatorNamingPolicy(bool lowercase, char separator)
{
Debug.Assert(char.IsPunctuation(separator));

_lowercase = lowercase;
_separator = separator;
_separator = new Rune(separator);
}

public sealed override string ConvertName(string name)
Expand All @@ -31,7 +31,7 @@ public sealed override string ConvertName(string name)
return ConvertNameCore(_separator, _lowercase, name);
}

private static string ConvertNameCore(char separator, bool lowercase, string name)
private static string ConvertNameCore(Rune separator, bool lowercase, string name)
{
Debug.Assert(name != null);

Expand All @@ -44,15 +44,15 @@ private static string ConvertNameCore(char separator, bool lowercase, string nam
? stackalloc char[JsonConstants.StackallocCharThreshold]
: (rentedBuffer = ArrayPool<char>.Shared.Rent(initialBufferLength));

ReadOnlySpan<char> chars = name.AsSpan();
SeparatorState state = SeparatorState.NotStarted;
int charsWritten = 0;

for (int i = 0; i < chars.Length; i++)
for (int i = 0; i < name.Length;)
{
char current = chars[i];
Rune current = Rune.GetRuneAt(name, i);
int charLength = current.Utf16SequenceLength;

switch (char.GetUnicodeCategory(current))
switch (Rune.GetUnicodeCategory(current))
{
case UnicodeCategory.UppercaseLetter:

Expand All @@ -65,7 +65,7 @@ private static string ConvertNameCore(char separator, bool lowercase, string nam
case SeparatorState.SpaceSeparator:
// An uppercase letter following a sequence of lowercase letters or spaces
// denotes the start of a new grouping: emit a separator character.
WriteChar(separator, ref destination);
Write(separator, ref destination);
break;

case SeparatorState.UppercaseLetter:
Expand All @@ -74,10 +74,15 @@ private static string ConvertNameCore(char separator, bool lowercase, string nam
// final letter, assuming it is followed by lowercase letters.
// For example, the value 'XMLReader' should render as 'xml_reader',
// however 'SHA512Hash' should render as 'sha512-hash'.
if (i + 1 < chars.Length && char.IsLower(chars[i + 1]))
if (i + charLength < name.Length)
{
WriteChar(separator, ref destination);
Rune next = Rune.GetRuneAt(name, i + charLength);
if (Rune.GetUnicodeCategory(next) is UnicodeCategory.LowercaseLetter)
{
Write(separator, ref destination);
}
}

break;

default:
Expand All @@ -86,9 +91,11 @@ private static string ConvertNameCore(char separator, bool lowercase, string nam
}

if (lowercase)
current = char.ToLowerInvariant(current);
{
current = Rune.ToLowerInvariant(current);
}

WriteChar(current, ref destination);
Write(current, ref destination);
state = SeparatorState.UppercaseLetter;
break;

Expand All @@ -98,13 +105,15 @@ private static string ConvertNameCore(char separator, bool lowercase, string nam
if (state is SeparatorState.SpaceSeparator)
{
// Normalize preceding spaces to one separator.
WriteChar(separator, ref destination);
Write(separator, ref destination);
}

if (!lowercase)
current = char.ToUpperInvariant(current);
{
current = Rune.ToUpperInvariant(current);
}

WriteChar(current, ref destination);
Write(current, ref destination);
state = SeparatorState.LowercaseLetterOrDigit;
break;

Expand All @@ -122,10 +131,12 @@ private static string ConvertNameCore(char separator, bool lowercase, string nam
// are written as-is to the output and reset the separator state.
// E.g. 'ABC???def' maps to 'abc???def' in snake_case.

WriteChar(current, ref destination);
Write(current, ref destination);
state = SeparatorState.NotStarted;
break;
}

i += charLength;
}

name = destination.Slice(0, charsWritten).ToString();
Expand All @@ -139,14 +150,16 @@ private static string ConvertNameCore(char separator, bool lowercase, string nam
return name;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
void WriteChar(char value, ref Span<char> destination)
void Write(Rune rune, ref Span<char> destination)
{
if (charsWritten == destination.Length)
if (charsWritten + 2 > destination.Length)
{
ExpandBuffer(ref destination);
}

destination[charsWritten++] = value;
int written = rune.EncodeToUtf16(destination.Slice(charsWritten));
Debug.Assert(written == rune.Utf16SequenceLength);
charsWritten += written;
}

void ExpandBuffer(ref Span<char> destination)
Expand All @@ -166,6 +179,118 @@ void ExpandBuffer(ref Span<char> destination)
}
}

#if !NETCOREAPP
// Provides a basic Rune polyfill that handles surrogate pairs
// TODO remove once https://github.com/dotnet/runtime/issues/52947 is complete.
/// <summary>
/// Basic polyfill for <c>System.Text.Rune</c> used on targets where NETCOREAPP is not defined.
/// Represents either a single non-surrogate UTF-16 code unit or a valid surrogate pair,
/// together with the Unicode category that was determined when the value was created.
/// </summary>
private readonly struct Rune
{
    // First (or only) UTF-16 code unit; the high surrogate when the value is a pair.
    private readonly char _first;
    // Second code unit of a surrogate pair; null for single-unit values.
    private readonly char? _lowSurrogate;
    // Category captured at construction time; it is never recomputed afterwards.
    private readonly UnicodeCategory _category;

    /// <summary>Number of UTF-16 code units needed to encode this value: 2 for a surrogate pair, otherwise 1.</summary>
    public int Utf16SequenceLength => _lowSurrogate.HasValue ? 2 : 1;

    /// <summary>Returns the Unicode category stored with <paramref name="rune"/>.</summary>
    public static UnicodeCategory GetUnicodeCategory(Rune rune)
        => rune._category;

    /// <summary>Creates a value from a single non-surrogate character, looking up its category.</summary>
    public Rune(char ch) : this(ch, char.GetUnicodeCategory(ch))
    {
    }

    private Rune(char ch, UnicodeCategory category)
    {
        Debug.Assert(!char.IsSurrogate(ch));
        _first = ch;
        _category = category;
    }

    private Rune(char highSurrogate, char lowSurrogate, UnicodeCategory category)
    {
        Debug.Assert(char.IsSurrogatePair(highSurrogate, lowSurrogate));
        _first = highSurrogate;
        _lowSurrogate = lowSurrogate;
        _category = category;
    }

    /// <summary>
    /// Returns the invariant-culture lowercase form of <paramref name="value"/>.
    /// </summary>
    /// <remarks>
    /// The stored category is relabelled from UppercaseLetter to LowercaseLetter rather than
    /// being looked up again for the converted code units; every other category is carried over.
    /// </remarks>
    public static Rune ToLowerInvariant(Rune value)
    {
        UnicodeCategory category = value._category;
        if (category is UnicodeCategory.UppercaseLetter)
        {
            category = UnicodeCategory.LowercaseLetter;
        }

        if (value._lowSurrogate is not char lowSurrogate)
        {
            return new Rune(char.ToLowerInvariant(value._first), category);
        }

        // Surrogate pairs are converted as a two-unit span so both halves are mapped together.
        ReadOnlySpan<char> source = stackalloc char[] { value._first, lowSurrogate };
        Span<char> destination = stackalloc char[2];

        source.ToLowerInvariant(destination);
        return new Rune(destination[0], destination[1], category);
    }

    /// <summary>
    /// Returns the invariant-culture uppercase form of <paramref name="value"/>.
    /// </summary>
    /// <remarks>
    /// The stored category is relabelled from LowercaseLetter to UppercaseLetter rather than
    /// being looked up again for the converted code units; every other category is carried over.
    /// </remarks>
    public static Rune ToUpperInvariant(Rune value)
    {
        UnicodeCategory category = value._category;
        if (category is UnicodeCategory.LowercaseLetter)
        {
            category = UnicodeCategory.UppercaseLetter;
        }

        if (value._lowSurrogate is not char lowSurrogate)
        {
            return new Rune(char.ToUpperInvariant(value._first), category);
        }

        // Surrogate pairs are converted as a two-unit span so both halves are mapped together.
        ReadOnlySpan<char> source = stackalloc char[] { value._first, lowSurrogate };
        Span<char> destination = stackalloc char[2];

        source.ToUpperInvariant(destination);
        return new Rune(destination[0], destination[1], category);
    }

    /// <summary>
    /// Writes the UTF-16 code units of this value to the start of <paramref name="destination"/>.
    /// </summary>
    /// <returns>The number of code units written (1 or 2).</returns>
    public int EncodeToUtf16(Span<char> destination)
    {
        Debug.Assert(Utf16SequenceLength <= destination.Length);
        destination[0] = _first;

        if (_lowSurrogate is not char lowSurrogate)
        {
            return 1;
        }

        destination[1] = lowSurrogate;
        return 2;
    }

    /// <summary>
    /// Reads the value that starts at <paramref name="index"/> in <paramref name="input"/>.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// The character at <paramref name="index"/> is a surrogate that is not the high half of a
    /// valid pair with the character that follows it, or it is the last character of the string.
    /// </exception>
    public static Rune GetRuneAt(string input, int index)
    {
        char first = input[index];
        UnicodeCategory category = char.GetUnicodeCategory(first);
        if (category is UnicodeCategory.Surrogate)
        {
            char lowSurrogate = default;
            if (index + 1 == input.Length ||
                !char.IsSurrogatePair(first, lowSurrogate = input[index + 1]))
            {
                // CharUnicodeInfo.GetUnicodeCategory does
                // not throw so we throw here instead.
                ThrowArgumentException();

                // Use the (message, paramName) overload so that ParamName identifies the
                // offending argument instead of the parameter name being used as the message.
                static void ThrowArgumentException() => throw new ArgumentException(
                    "Cannot extract a Unicode scalar value from the specified index in the input.",
                    nameof(input));
            }

            // Look the category up on the string so the whole pair is classified.
            category = CharUnicodeInfo.GetUnicodeCategory(input, index);
            return new Rune(first, lowSurrogate, category);
        }

        return new Rune(first, category);
    }
}
#endif
private enum SeparatorState
{
NotStarted,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ namespace System.Text.Json.Serialization.Tests
public static class NamingPolicyUnitTests
{
private readonly static CamelCaseNamingStrategy s_newtonsoftCamelCaseNamingStrategy = new();

[Theory]
// These test cases were copied from Json.NET.
[InlineData("urlValue", "URLValue")]
Expand Down Expand Up @@ -65,7 +66,6 @@ public static void CamelCaseNullNameReturnsNull()
[InlineData("i18n", "i18n")]
[InlineData("i18n_policy", "I18nPolicy")]
[InlineData("7samurai", "7samurai")]
[InlineData("άλφα_βήτα_γάμμα", "ΆλφαΒήταΓάμμα")]
[InlineData("camel_case", "camelCase")]
[InlineData("camel_case", "CamelCase")]
[InlineData("snake_case", "snake_case")]
Expand Down Expand Up @@ -95,6 +95,10 @@ public static void CamelCaseNullNameReturnsNull()
[InlineData("abc_7ef", " abc 7ef ")]
[InlineData("ab7_def", " ab7 def ")]
[InlineData("_abc", "_abc")]
[InlineData("", "")]
[InlineData("😀", "😀")] // Surrogate pairs
[InlineData("άλφα_βήτα_γάμμα", "ΆλφαΒήταΓάμμα")] // Non-ascii letters
[InlineData("𐐨𐐨𐐨_𐐨𐐨𐐨", "𐐀𐐨𐐨𐐀𐐨𐐨")] // Surrogate pair letters
[InlineData("a%", "a%")]
[InlineData("_?#-", "_?#-")]
[InlineData("?!?", "? ! ?")]
Expand Down Expand Up @@ -128,7 +132,6 @@ public static void ToSnakeLowerCase(string expectedResult, string name)
[InlineData("I18N", "i18n")]
[InlineData("I18N_POLICY", "I18nPolicy")]
[InlineData("7SAMURAI", "7samurai")]
[InlineData("ΆΛΦΑ_ΒΉΤΑ_ΓΆΜΜΑ", "ΆλφαΒήταΓάμμα")]
[InlineData("CAMEL_CASE", "camelCase")]
[InlineData("CAMEL_CASE", "CamelCase")]
[InlineData("SNAKE_CASE", "snake_case")]
Expand Down Expand Up @@ -158,6 +161,10 @@ public static void ToSnakeLowerCase(string expectedResult, string name)
[InlineData("ABC_7EF", " abc 7ef ")]
[InlineData("AB7_DEF", " ab7 def ")]
[InlineData("_ABC", "_abc")]
[InlineData("", "")]
[InlineData("😀", "😀")] // Surrogate pairs
[InlineData("ΆΛΦΑ_ΒΉΤΑ_ΓΆΜΜΑ", "ΆλφαΒήταΓάμμα")] // Non-ascii letters
[InlineData("𐐀𐐀𐐀_𐐀𐐀𐐀", "𐐀𐐨𐐨𐐀𐐨𐐨")] // Surrogate pair letters
[InlineData("A%", "a%")]
[InlineData("_?#-", "_?#-")]
[InlineData("?!?", "? ! ?")]
Expand Down Expand Up @@ -191,7 +198,6 @@ public static void ToSnakeUpperCase(string expectedResult, string name)
[InlineData("i18n", "i18n")]
[InlineData("i18n-policy", "I18nPolicy")]
[InlineData("7samurai", "7samurai")]
[InlineData("άλφα-βήτα-γάμμα", "ΆλφαΒήταΓάμμα")]
[InlineData("camel-case", "camelCase")]
[InlineData("camel-case", "CamelCase")]
[InlineData("snake_case", "snake_case")]
Expand Down Expand Up @@ -222,6 +228,10 @@ public static void ToSnakeUpperCase(string expectedResult, string name)
[InlineData("abc-7ef", " abc 7ef ")]
[InlineData("ab7-def", " ab7 def ")]
[InlineData("-abc", "-abc")]
[InlineData("", "")]
[InlineData("😀", "😀")] // Surrogate pairs
[InlineData("άλφα-βήτα-γάμμα", "ΆλφαΒήταΓάμμα")] // Non-ascii letters
[InlineData("𐐨𐐨𐐨-𐐨𐐨𐐨", "𐐀𐐨𐐨𐐀𐐨𐐨")] // Surrogate pair letters
[InlineData("a%", "a%")]
[InlineData("-?#_", "-?#_")]
[InlineData("?!?", "? ! ?")]
Expand Down Expand Up @@ -255,7 +265,6 @@ public static void ToKebabLowerCase(string expectedResult, string name)
[InlineData("I18N", "i18n")]
[InlineData("I18N-POLICY", "I18nPolicy")]
[InlineData("7SAMURAI", "7samurai")]
[InlineData("ΆΛΦΑ-ΒΉΤΑ-ΓΆΜΜΑ", "ΆλφαΒήταΓάμμα")]
[InlineData("CAMEL-CASE", "camelCase")]
[InlineData("CAMEL-CASE", "CamelCase")]
[InlineData("SNAKE_CASE", "snake_case")]
Expand Down Expand Up @@ -286,6 +295,10 @@ public static void ToKebabLowerCase(string expectedResult, string name)
[InlineData("ABC-7EF", " abc 7ef ")]
[InlineData("AB7-DEF", " ab7 def ")]
[InlineData("-ABC", "-abc")]
[InlineData("", "")]
[InlineData("😀", "😀")] // Surrogate pairs
[InlineData("ΆΛΦΑ-ΒΉΤΑ-ΓΆΜΜΑ", "ΆλφαΒήταΓάμμα")] // Non-ascii letters
[InlineData("𐐀𐐀𐐀-𐐀𐐀𐐀", "𐐀𐐨𐐨𐐀𐐨𐐨")] // Surrogate pair letters
[InlineData("A%", "a%")]
[InlineData("-?#_", "-?#_")]
[InlineData("?!?", "? ! ?")]
Expand Down Expand Up @@ -313,6 +326,22 @@ public static void ToKebabUpperCase(string expectedResult, string name)
Assert.Equal(expectedResult, value);
}

[Fact]
public static void SnakeCasePolicy_MissingSurrogatePair_ThrowsArgumentException()
{
    // The name ends with a high surrogate that has no low surrogate after it.
    const string UnpairedSurrogateName = "xyz\ud83d";

    JsonNamingPolicy[] snakeCasePolicies =
    {
        JsonNamingPolicy.SnakeCaseLower,
        JsonNamingPolicy.SnakeCaseUpper,
    };

    foreach (JsonNamingPolicy policy in snakeCasePolicies)
    {
        Assert.Throws<ArgumentException>(() => policy.ConvertName(UnpairedSurrogateName));
    }
}

[Fact]
public static void KebabCasePolicy_MissingSurrogatePair_ThrowsArgumentException()
{
    // The name ends with a high surrogate that has no low surrogate after it.
    const string UnpairedSurrogateName = "xyz\ud83d";

    JsonNamingPolicy[] kebabCasePolicies =
    {
        JsonNamingPolicy.KebabCaseLower,
        JsonNamingPolicy.KebabCaseUpper,
    };

    foreach (JsonNamingPolicy policy in kebabCasePolicies)
    {
        Assert.Throws<ArgumentException>(() => policy.ConvertName(UnpairedSurrogateName));
    }
}

[Theory, OuterLoop]
[MemberData(nameof(GetValidMemberNames))]
public static void CamelCaseNamingPolicyMatchesNewtonsoftNamingStrategy(string name)
Expand Down

0 comments on commit c1c5d1c

Please sign in to comment.