Skip to content

Commit

Permalink
Add tighter bound to range check for matching Regex char classes (#67133
Browse files Browse the repository at this point in the history
)

When we emit a bitmap lookup for character classes containing only ASCII characters, we currently bound the check by 128, e.g.
```C#
if (ch < 128 && lookupTable[...])
```
but we can easily lower that 128 to instead be the actual exclusive upper bound based on the char set.  Doing so means we don't need to hit the lookup table for a larger set of characters.

(We could also actually shrink the size of the lookup table itself, but doing so would only save a few bytes, and it didn't seem worth the complexity right now.  We could also add a lower range check, but that's also additional checks to execute whereas this one is just improving an existing check that's also required for correctness.)
  • Loading branch information
stephentoub authored Mar 25, 2022
1 parent 6a889d2 commit 55e012e
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4006,8 +4006,8 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
// character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we
// can just fail the comparison.
return negate ?
$"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
$"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" :
$"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)";
}

if (analysis.AllNonAsciiContained)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -921,6 +921,8 @@ internal struct CharClassAnalysisResults
public bool AllAsciiContained;
/// <summary>true if we know for sure that all non-ASCII values are in the set; otherwise, false.</summary>
public bool AllNonAsciiContained;
/// <summary>The exclusive upper bound. Only valid if <see cref="ContainsOnlyAscii"/> is true.</summary>
public int UpperBoundExclusiveIfContainsOnlyAscii;
}

/// <summary>Analyzes the set to determine some basic properties that can be used to optimize usage.</summary>
Expand Down Expand Up @@ -962,6 +964,7 @@ internal static CharClassAnalysisResults Analyze(string set)
AllAsciiContained = false,
ContainsOnlyAscii = set[set.Length - 1] <= 128,
ContainsNoAscii = set[SetStartIndex] >= 128,
UpperBoundExclusiveIfContainsOnlyAscii = set[set.Length - 1],
};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5020,7 +5020,7 @@ void EmitCharInClass()

// ch < 128 ? (bitVectorString[ch >> 4] & (1 << (ch & 0xF))) != 0 :
Ldloc(tempLocal);
Ldc(128);
Ldc(analysis.ContainsOnlyAscii ? analysis.UpperBoundExclusiveIfContainsOnlyAscii : 128);
Bge(comparisonLabel);
Ldstr(bitVectorString);
Ldloc(tempLocal);
Expand Down

0 comments on commit 55e012e

Please sign in to comment.