Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce frozen collection creation overheads for ignore-case with ASCII values #100998

Merged
merged 3 commits into from
Apr 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.InteropServices;

Expand Down Expand Up @@ -76,6 +77,8 @@ public static unsafe int GetHashCodeOrdinal(ReadOnlySpan<char> s)
// useful if the string only contains ASCII characters
public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan<char> s)
{
Debug.Assert(KeyAnalyzer.IsAllAscii(s));

// We "normalize to lowercase" every char by ORing with 0x20. This casts
// a very wide net because it will change, e.g., '^' to '~'. But that should
// be ok because we expect this to be very rare in practice.
Expand Down Expand Up @@ -138,14 +141,17 @@ public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan<char> s)

public static unsafe int GetHashCodeOrdinalIgnoreCase(ReadOnlySpan<char> s)
{
#if NET
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
return string.GetHashCode(s, StringComparison.OrdinalIgnoreCase);
#else
int length = s.Length;

char[]? rentedArray = null;
Span<char> scratch = length <= 256 ?
stackalloc char[256] :
(rentedArray = ArrayPool<char>.Shared.Rent(length));

length = s.ToUpperInvariant(scratch); // NOTE: this really should be the (non-existent) ToUpperOrdinal
length = s.ToUpperInvariant(scratch); // NOTE: this both allocates and really should be the (non-existent) ToUpperOrdinal
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
int hash = GetHashCodeOrdinal(scratch.Slice(0, length));

if (rentedArray is not null)
Expand All @@ -154,6 +160,7 @@ public static unsafe int GetHashCodeOrdinalIgnoreCase(ReadOnlySpan<char> s)
}

return hash;
#endif
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,40 @@ public static AnalysisResults Analyze(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength)
{
Debug.Assert(!uniqueStrings.IsEmpty);
bool allUniqueStringsAreConfirmedAscii = ignoreCase && AreAllAscii(uniqueStrings);
stephentoub marked this conversation as resolved.
Show resolved Hide resolved

// Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
AnalysisResults results;
if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results))
if (minLength == 0 || !TryUseSubstring(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, out results))
{
results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
results = CreateAnalysisResults(uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
}

return results;
}

/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
{
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
int uniqueStringsLength = uniqueStrings.Length;

// Sufficient uniqueness factor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStringsLength / 20;
int acceptableNonUniqueCount = uniqueStrings.Length / 20;

// If we're case-sensitive, we can just use a case-sensitive substring comparer, which is cheap.
// For case-insensitive / ignoreCase, we don't know which portion of the input strings we're going to care about.
// However, if all of the strings are entirely ASCII, we know that no portion of any will be non-ASCII and thus can
// use a comparer that only knows how to do ASCII-based ordinal casing, which is cheaper than full Unicode casing,
// in particular for GetHashCode.
SubstringComparer comparer =
!ignoreCase ? new JustifiedSubstringComparer() :
allUniqueStringsAreConfirmedAscii ? new JustifiedCaseInsensitiveAsciiSubstringComparer() :
new JustifiedCaseInsensitiveSubstringComparer();

SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStringsLength,
uniqueStrings.Length,
#endif
comparer);

Expand All @@ -77,7 +86,7 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, index, count,
uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, index, count,
static (string s, int index, int count) => s.AsSpan(index, count));
return true;
}
Expand All @@ -101,7 +110,7 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
uniqueStrings, allUniqueStringsAreConfirmedAscii, ignoreCase, minLength, maxLength, comparer.Index, count,
static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
return true;
}
Expand All @@ -115,7 +124,7 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
}

private static AnalysisResults CreateAnalysisResults(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getHashString)
ReadOnlySpan<string> uniqueStrings, bool allUniqueStringsAreConfirmedAscii, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getHashString)
{
// Start off by assuming all strings are ASCII
bool allAsciiIfIgnoreCase = true;
Expand All @@ -134,10 +143,9 @@ private static AnalysisResults CreateAnalysisResults(
foreach (string uniqueString in uniqueStrings)
{
// Get a span representing the slice of the uniqueString which will be hashed.
ReadOnlySpan<char> hashString = getHashString(uniqueString, index, count);

// If the slice isn't ASCII, bail out to return the results.
if (!IsAllAscii(hashString))
if (!allUniqueStringsAreConfirmedAscii &&
!IsAllAscii(getHashString(uniqueString, index, count)))
{
allAsciiIfIgnoreCase = false;
canSwitchIgnoreCaseHashToCaseSensitive = false;
Expand All @@ -153,13 +161,14 @@ private static AnalysisResults CreateAnalysisResults(
// and as we have just checked that IsAllAscii(hashString) is true
// then we know IsAllAscii(uniqueString) must be true,
// so we can skip the check.
if (count > 0 && !IsAllAscii(uniqueString.AsSpan()))
{
canSwitchIgnoreCaseHashToCaseSensitive = false;
}
else if (ContainsAnyAsciiLetters(uniqueString.AsSpan()))
if ((count > 0 && !allUniqueStringsAreConfirmedAscii && !IsAllAscii(uniqueString.AsSpan())) ||
ContainsAnyAsciiLetters(uniqueString.AsSpan()))
{
canSwitchIgnoreCaseHashToCaseSensitive = false;
if (allUniqueStringsAreConfirmedAscii)
{
break;
}
}
}
}
Expand All @@ -177,6 +186,19 @@ private static AnalysisResults CreateAnalysisResults(

private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);

private static bool AreAllAscii(ReadOnlySpan<string> strings)
{
foreach (string s in strings)
{
if (!IsAllAscii(s.AsSpan()))
{
return false;
}
}

return true;
}

internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
{
#if NET8_0_OR_GREATER
Expand Down Expand Up @@ -296,5 +318,11 @@ private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringCompar
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
}

private sealed class JustifiedCaseInsensitiveAsciiSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCaseAscii(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,35 @@ protected override string CreateTKey(int seed)
}

protected override string CreateTValue(int seed) => CreateTKey(seed);

[Theory]
[InlineData(0)]
[InlineData(25)]
[InlineData(50)]
[InlineData(75)]
[InlineData(100)]
public void ContainsKey_WithNonAscii(int percentageWithNonAscii)
{
Random rand = new(42);

Dictionary<string, string> expected = new(GetKeyIEqualityComparer());
for (int i = 0; i < 100; i++)
{
int stringLength = rand.Next(4, 30);
byte[] bytes = new byte[stringLength];
rand.NextBytes(bytes);
string value = Convert.ToBase64String(bytes);
if (rand.Next(100) < percentageWithNonAscii)
{
value = value.Replace(value[rand.Next(value.Length)], '\u05D0');
}
expected.Add(value, value);
}

FrozenDictionary<string, string> actual = expected.ToFrozenDictionary(GetKeyIEqualityComparer());

Assert.All(expected, kvp => actual.ContainsKey(kvp.Key));
}
}

public class FrozenDictionary_Generic_Tests_string_string_Default : FrozenDictionary_Generic_Tests_string_string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
using System.Globalization;
using System.Linq;
using Xunit;
using System.Numerics;
using System.Collections.Immutable;
using System.Diagnostics.CodeAnalysis;

Expand Down Expand Up @@ -338,6 +337,35 @@ protected override string CreateT(int seed)
rand.NextBytes(bytes1);
return Convert.ToBase64String(bytes1);
}

[Theory]
[InlineData(0)]
[InlineData(25)]
[InlineData(50)]
[InlineData(75)]
[InlineData(100)]
public void Contains_WithNonAscii(int percentageWithNonAscii)
{
Random rand = new(42);

HashSet<string> expected = new(GetIEqualityComparer());
for (int i = 0; i < 100; i++)
{
int stringLength = rand.Next(4, 30);
byte[] bytes = new byte[stringLength];
rand.NextBytes(bytes);
string value = Convert.ToBase64String(bytes);
if (rand.Next(100) < percentageWithNonAscii)
{
value = value.Replace(value[rand.Next(value.Length)], '\u05D0');
}
expected.Add(value);
}

FrozenSet<string> actual = expected.ToFrozenSet(GetIEqualityComparer());

Assert.All(expected, s => actual.Contains(s));
}
}

public class FrozenSet_Generic_Tests_string_Default : FrozenSet_Generic_Tests_string
Expand Down
Loading