Skip to content

Commit

Permalink
Fix string comparison with ordinal casing with Surrogates (#55771)
Browse files Browse the repository at this point in the history
* Fix string comparison with ordinal casing with Surrogates

* Address the feedback
  • Loading branch information
tarekgh authored Jul 16, 2021
1 parent 04a8f5e commit 1d065b6
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,20 @@ public static IEnumerable<object[]> Compare_TestData()
yield return new object[] { new CultureInfo("de-DE_phoneb").CompareInfo, "\u00DC", "UE", CompareOptions.None, useNls ? 0 : -1 };
yield return new object[] { new CultureInfo("es-ES_tradnl").CompareInfo, "llegar", "lugar", CompareOptions.None, useNls ? 1 : -1 };
}

//
// Ordinal comparisons that ignore casing.
//

yield return new object[] { s_invariantCompare, "abcd", "abcd", CompareOptions.OrdinalIgnoreCase, 0};
yield return new object[] { s_invariantCompare, "abcd", "ABCD", CompareOptions.OrdinalIgnoreCase, 0};
yield return new object[] { s_invariantCompare, "Hello\u00F6", "HELLO\u00D6", CompareOptions.OrdinalIgnoreCase, 0};
yield return new object[] { s_invariantCompare, "Hello\uFE6A", "Hello\U0001F601", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
yield return new object[] { s_invariantCompare, "Hello\U0001F601", "Hello\uFE6A", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 1};
yield return new object[] { s_invariantCompare, "\uDBFF", "\uD800\uDC00", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
yield return new object[] { s_invariantCompare, "\uD800\uDC00", "\uDBFF", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 1};
yield return new object[] { s_invariantCompare, "abcdefg\uDBFF", "abcdefg\uD800\uDC00", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
yield return new object[] { s_invariantCompare, "\U00010400", "\U00010428", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 0};
}

// There is a regression in Windows 190xx version with the Kana comparison. Avoid running this test there.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,15 @@ internal static void ToLower(ReadOnlySpan<char> source, Span<char> destination)
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (uint, int) GetScalar(ref char charA, int index, int length)
private static (uint, int) GetScalar(ref char source, int index, int length)
{
char charA = source;
if (!char.IsHighSurrogate(charA) || index >= length - 1)
{
return ((uint)charA, 1);
}

ref char charB = ref Unsafe.Add(ref charA, 1);
char charB = Unsafe.Add(ref source, 1);
if (!char.IsLowSurrogate(charB))
{
return ((uint)charA, 1);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Text;
using System.Diagnostics;
using System.Threading;
using System.Runtime.InteropServices;
Expand Down Expand Up @@ -196,72 +197,93 @@ internal static int CompareStringIgnoreCase(ref char strA, int lengthA, ref char
ref char charA = ref strA;
ref char charB = ref strB;

while (length != 0)
int index = 0;

while (index < length)
{
// optimize for Ascii cases
if (charA <= '\u00FF' || length == 1 || !char.IsHighSurrogate(charA) || !char.IsHighSurrogate(charB))
char a = charA;
char b = charB;
char lowSurrogateA = '\0';

if (!char.IsHighSurrogate(a) || index >= lengthA - 1 || !char.IsLowSurrogate(lowSurrogateA = Unsafe.Add(ref charA, 1)))
{
if (charA == charB)
if (!char.IsHighSurrogate(b) || index >= lengthB - 1 || !char.IsLowSurrogate(Unsafe.Add(ref charB, 1)))
{
length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
}
//
// Neither A nor B is a surrogate
//

char aUpper = OrdinalCasing.ToUpper(charA);
char bUpper = OrdinalCasing.ToUpper(charB);
if (b == a)
{
index++;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
}

if (aUpper == bUpper)
{
length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
char aUpper = OrdinalCasing.ToUpper(a);
char bUpper = OrdinalCasing.ToUpper(b);

if (aUpper == bUpper)
{
index++;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
}

return a - b;
}

return aUpper - bUpper;
//
// charA is not a surrogate and charB is a valid surrogate
//

return -1;
}

// We come here only if we have valid high surrogates and length > 1
//
// A is Surrogate
//

char a = charA;
char b = charB;

length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
char lowSurrogateB = '\0';

if (!char.IsLowSurrogate(charA) || !char.IsLowSurrogate(charB))
if (!char.IsHighSurrogate(b) || index >= lengthB - 1 || !char.IsLowSurrogate(lowSurrogateB = Unsafe.Add(ref charB, 1)))
{
// malformed Surrogates - should be rare cases
if (a != b)
{
return a - b;
}
//
// charB is not a surrogate and charA is a surrogate
//

// Should be pointing to the right characters in the string to resume at.
// Just in case we could be pointing at high surrogate now.
continue;
return 1;
}

// we come here only if we have valid full surrogates
SurrogateCasing.ToUpper(a, charA, out char h1, out char l1);
SurrogateCasing.ToUpper(b, charB, out char h2, out char l2);
//
// charA and charB are surrogates
//

Debug.Assert(lowSurrogateA != '\0');
Debug.Assert(lowSurrogateB != '\0');

if (h1 != h2)
if (a == b && lowSurrogateA == lowSurrogateB)
{
return (int)h1 - (int)h2;
index += 2;
charA = ref Unsafe.Add(ref charA, 2);
charB = ref Unsafe.Add(ref charB, 2);
continue;
}

if (l1 != l2)
uint upperSurrogateA = CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(a, lowSurrogateA));
uint upperSurrogateB = CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(b, lowSurrogateB));

if (upperSurrogateA == upperSurrogateB)
{
return (int)l1 - (int)l2;
index += 2;
charA = ref Unsafe.Add(ref charA, 2);
charB = ref Unsafe.Add(ref charB, 2);
continue;
}

length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
return (int)upperSurrogateA - (int)upperSurrogateB;
}

return lengthA - lengthB;
Expand Down

0 comments on commit 1d065b6

Please sign in to comment.