dotnet · stephentoub · Apr 21, 2024 · Apr 12, 2024 · Apr 19, 2024 · Apr 19, 2024
diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs
@@ -2,6 +2,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Buffers;
+using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.InteropServices;
 
@@ -76,6 +77,8 @@ public static unsafe int GetHashCodeOrdinal(ReadOnlySpan<char> s)
         // useful if the string only contains ASCII characters
         public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan<char> s)
         {
+            Debug.Assert(KeyAnalyzer.IsAllAscii(s));
+
             // We "normalize to lowercase" every char by ORing with 0x20. This casts
             // a very wide net because it will change, e.g., '^' to '~'. But that should
             // be ok because we expect this to be very rare in practice.
@@ -138,14 +141,17 @@ public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan<char> s)
 
         public static unsafe int GetHashCodeOrdinalIgnoreCase(ReadOnlySpan<char> s)
         {
+#if NET
+            return string.GetHashCode(s, StringComparison.OrdinalIgnoreCase);
+#else
             int length = s.Length;
 
             char[]? rentedArray = null;
             Span<char> scratch = length <= 256 ?
                 stackalloc char[256] :
                 (rentedArray = ArrayPool<char>.Shared.Rent(length));
 
-            length = s.ToUpperInvariant(scratch); // NOTE: this really should be the (non-existent) ToUpperOrdinal
+            length = s.ToUpperInvariant(scratch); // NOTE: this both allocates and really should be the (non-existent) ToUpperOrdinal
             int hash = GetHashCodeOrdinal(scratch.Slice(0, length));
 
             if (rentedArray is not null)
@@ -154,6 +160,7 @@ public static unsafe int GetHashCodeOrdinalIgnoreCase(ReadOnlySpan<char> s)
             }
 
             return hash;
+#endif
         }
     }
 }
diff --git a/...ibraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/...ibraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs
@@ -32,31 +32,40 @@ public static AnalysisResults Analyze(
             ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength)
         {
             Debug.Assert(!uniqueStrings.IsEmpty);
+            bool allUniqueStringsAreConfirmedAscii = ignoreCase && AreAllAscii(uniqueStrings);
 
             // Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
             AnalysisResults results;
-            if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results))
+            if (minLength == 0 || !TryUseSubstring(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, out results))
             {
-                results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
+                results = CreateAnalysisResults(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
             }
 
             return results;
         }
 
         /// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
-        private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
+        private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
         {
             const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
-            int uniqueStringsLength = uniqueStrings.Length;
 
             // Sufficient uniqueness factor of 95% is good enough.
             // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
-            int acceptableNonUniqueCount = uniqueStringsLength / 20;
+            int acceptableNonUniqueCount = uniqueStrings.Length / 20;
+
+            // If we're case-sensitive, we can just use a case-sensitive substring comparer, which is cheap.
+            // For case-insensitive / ignoreCase, we don't know which portion of the input strings we're going to care about.
+            // However, if all of the strings are entirely ASCII, we know that no portion of any will be non-ASCII and thus can
+            // use a comparer that only knows how to do ASCII-based ordinal casing, which is cheaper than full Unicode casing,
+            // in particular for GetHashCode.
+            SubstringComparer comparer =
+                !ignoreCase ? new JustifiedSubstringComparer() :
+                allUniqueStringsAreConfirmedAscii ? new JustifiedCaseInsensitiveAsciiSubstringComparer() :
+                new JustifiedCaseInsensitiveSubstringComparer();
 
-            SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
             HashSet<string> set = new HashSet<string>(
 #if NET6_0_OR_GREATER
-                uniqueStringsLength,
+                uniqueStrings.Length,
 #endif
                 comparer);
 
@@ -77,7 +86,7 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
                     if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
                     {
                         results = CreateAnalysisResults(
-                            uniqueStrings, ignoreCase, minLength, maxLength, index, count,
+                            uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, index, count,
                             static (string s, int index, int count) => s.AsSpan(index, count));
                         return true;
                     }
@@ -101,7 +110,7 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
                         if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
                         {
                             results = CreateAnalysisResults(
-                                uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
+                                uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, comparer.Index, count,
                                 static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
                             return true;
                         }
@@ -115,7 +124,7 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
         }
 
         private static AnalysisResults CreateAnalysisResults(
-            ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getHashString)
+            ReadOnlySpan<string> uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getHashString)
         {
             // Start off by assuming all strings are ASCII
             bool allAsciiIfIgnoreCase = true;
@@ -134,10 +143,9 @@ private static AnalysisResults CreateAnalysisResults(
                 foreach (string uniqueString in uniqueStrings)
                 {
                     // Get a span representing the slice of the uniqueString which will be hashed.
-                    ReadOnlySpan<char> hashString = getHashString(uniqueString, index, count);
-
                     // If the slice isn't ASCII, bail out to return the results.
-                    if (!IsAllAscii(hashString))
+                    if (!allUniqueStringsAreConfirmedAscii &&
+                        !IsAllAscii(getHashString(uniqueString, index, count)))
                     {
                         allAsciiIfIgnoreCase = false;
                         canSwitchIgnoreCaseHashToCaseSensitive = false;
@@ -153,13 +161,14 @@ private static AnalysisResults CreateAnalysisResults(
                         // and as we have just checked that IsAllAscii(hashString) is true
                         // then we know IsAllAscii(uniqueString) must be true,
                         // so we can skip the check.
-                        if (count > 0 && !IsAllAscii(uniqueString.AsSpan()))
-                        {
-                            canSwitchIgnoreCaseHashToCaseSensitive = false;
-                        }
-                        else if (ContainsAnyAsciiLetters(uniqueString.AsSpan()))
+                        if ((count > 0 && !allUniqueStringsAreConfirmedAscii && !IsAllAscii(uniqueString.AsSpan())) ||
+                            ContainsAnyAsciiLetters(uniqueString.AsSpan()))
                         {
                             canSwitchIgnoreCaseHashToCaseSensitive = false;
+                            if (allUniqueStringsAreConfirmedAscii)
+                            {
+                                break;
+                            }
                         }
                     }
                 }
@@ -177,6 +186,19 @@ private static AnalysisResults CreateAnalysisResults(
 
         private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);
 
+        private static bool AreAllAscii(ReadOnlySpan<string> strings)
+        {
+            foreach (string s in strings)
+            {
+                if (!IsAllAscii(s.AsSpan()))
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
         internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
         {
 #if NET8_0_OR_GREATER
@@ -296,5 +318,11 @@ private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringCompar
             public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
             public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
         }
+
+        private sealed class JustifiedCaseInsensitiveAsciiSubstringComparer : SubstringComparer
+        {
+            public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
+            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCaseAscii(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
+        }
     }
 }
diff --git a/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenDictionaryTests.cs b/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenDictionaryTests.cs
@@ -361,6 +361,35 @@ protected override string CreateTKey(int seed)
         }
 
         protected override string CreateTValue(int seed) => CreateTKey(seed);
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(25)]
+        [InlineData(50)]
+        [InlineData(75)]
+        [InlineData(100)]
+        public void ContainsKey_WithNonAscii(int percentageWithNonAscii)
+        {
+            Random rand = new(42);
+
+            Dictionary<string, string> expected = new(GetKeyIEqualityComparer());
+            for (int i = 0; i < 100; i++)
+            {
+                int stringLength = rand.Next(4, 30);
+                byte[] bytes = new byte[stringLength];
+                rand.NextBytes(bytes);
+                string value = Convert.ToBase64String(bytes);
+                if (rand.Next(100) < percentageWithNonAscii)
+                {
+                    value = value.Replace(value[rand.Next(value.Length)], '\u05D0');
+                }
+                expected.Add(value, value);
+            }
+
+            FrozenDictionary<string, string> actual = expected.ToFrozenDictionary(GetKeyIEqualityComparer());
+
+            Assert.All(expected, kvp => actual.ContainsKey(kvp.Key));
+        }
     }
 
     public class FrozenDictionary_Generic_Tests_string_string_Default : FrozenDictionary_Generic_Tests_string_string

diff --git a/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenSetTests.cs b/src/libraries/System.Collections.Immutable/tests/Frozen/FrozenSetTests.cs
@@ -6,7 +6,6 @@
 using System.Globalization;
 using System.Linq;
 using Xunit;
-using System.Numerics;
 using System.Collections.Immutable;
 using System.Diagnostics.CodeAnalysis;
 
@@ -338,6 +337,35 @@ protected override string CreateT(int seed)
             rand.NextBytes(bytes1);
             return Convert.ToBase64String(bytes1);
         }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(25)]
+        [InlineData(50)]
+        [InlineData(75)]
+        [InlineData(100)]
+        public void Contains_WithNonAscii(int percentageWithNonAscii)
+        {
+            Random rand = new(42);
+
+            HashSet<string> expected = new(GetIEqualityComparer());
+            for (int i = 0; i < 100; i++)
+            {
+                int stringLength = rand.Next(4, 30);
+                byte[] bytes = new byte[stringLength];
+                rand.NextBytes(bytes);
+                string value = Convert.ToBase64String(bytes);
+                if (rand.Next(100) < percentageWithNonAscii)
+                {
+                    value = value.Replace(value[rand.Next(value.Length)], '\u05D0');
+                }
+                expected.Add(value);
+            }
+
+            FrozenSet<string> actual = expected.ToFrozenSet(GetIEqualityComparer());
+
+            Assert.All(expected, s => actual.Contains(s));
+        }
     }
 
     public class FrozenSet_Generic_Tests_string_Default : FrozenSet_Generic_Tests_string