Skip to content

Commit

Permalink
Use IndexOf for bounded loops in a regex (#101899)
Browse files Browse the repository at this point in the history
Today with a pattern like `ab*c`, `IndexOfAnyExcept('b')` will be used to skip past the `b`s. But if that pattern is changed to `ab{0,1000}c`, we'll end up manually iterating, as the current specialization only handles unbounded loops. This adds the minor improvements necessary to also enable using `IndexOf` for bounded loops.
  • Loading branch information
stephentoub committed May 6, 2024
1 parent 3fb5386 commit 08e0f89
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4136,22 +4136,31 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
TransferSliceStaticPosToPos();
writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;");
}
else if (maxIterations == int.MaxValue && TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? indexOfExpr))
else if (TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? indexOfExpr))
{
// We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
// purely for simplicity; it could be removed in the future with additional code to handle that case.
// We can use an IndexOf method to perform the search. If the number of iterations is unbounded, we can just search the whole span.
// If, however, it's bounded, we need to slice the span to the min(remainingSpan.Length, maxIterations) so that we don't
// search more than is necessary.

// If maxIterations is 0, the node should have been optimized away. If it's 1 and min is 0, it should
// have been handled as an optional loop above, and if it's 1 and min is 1, it should have been transformed
// into a single char match. So, we should only be here if maxIterations is greater than 1. And that's relevant,
// because we wouldn't want to invest in an IndexOf call if we're only going to iterate once.
Debug.Assert(maxIterations > 1);

TransferSliceStaticPosToPos();

writer.Write($"int {iterationLocal} = {sliceSpan}");
if (sliceStaticPos != 0)
if (maxIterations != int.MaxValue)
{
writer.Write($".Slice({sliceStaticPos})");
writer.Write($".Slice(0, Math.Min({sliceSpan}.Length, {maxIterations}))");
}
writer.WriteLine($".{indexOfExpr};");

using (EmitBlock(writer, $"if ({iterationLocal} < 0)"))
{
writer.WriteLine(sliceStaticPos > 0 ?
$"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" :
writer.WriteLine(maxIterations != int.MaxValue ?
$"{iterationLocal} = Math.Min({sliceSpan}.Length, {maxIterations});" :
$"{iterationLocal} = {sliceSpan}.Length;");
}
writer.WriteLine();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4457,17 +4457,25 @@ void EmitSingleCharAtomicLoop(RegexNode node)
Sub();
Stloc(iterationLocal);
}
else if (maxIterations == int.MaxValue && CanEmitIndexOf(node, out _))
else if (maxIterations > 1 && CanEmitIndexOf(node, out _))
{
// We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
// purely for simplicity; it could be removed in the future with additional code to handle that case.
// We can use an IndexOf method to perform the search. If the number of iterations is unbounded, we can just search the whole span.
// If, however, it's bounded, we need to slice the span to the min(remainingSpan.Length, maxIterations) so that we don't
// search more than is necessary. (There's little point in using IndexOf for an optional / something with at most one iteration,
// so we also skip using IndexOf in that case.)

// int i = slice.Slice(sliceStaticPos).IndexOf(...);
if (sliceStaticPos > 0)
TransferSliceStaticPosToPos();

// int i = slice.Slice(0, Math.Min(maxIterations, slice.Length)).IndexOf(...);
if (maxIterations != int.MaxValue)
{
Ldloca(slice);
Ldc(sliceStaticPos);
Call(s_spanSliceIntMethod);
Ldc(0);
Ldc(maxIterations);
Ldloca(slice);
Call(s_spanGetLengthMethod);
Call(s_mathMinIntInt);
Call(s_spanSliceIntIntMethod);
}
else
{
Expand All @@ -4482,13 +4490,13 @@ void EmitSingleCharAtomicLoop(RegexNode node)
Ldc(0);
BgeFar(atomicLoopDoneLabel);

// i = slice.Length - sliceStaticPos;
// i = Math.Min(slice.Length, maxIterations);
Ldloca(slice);
Call(s_spanGetLengthMethod);
if (sliceStaticPos > 0)
if (maxIterations != int.MaxValue)
{
Ldc(sliceStaticPos);
Sub();
Ldc(maxIterations);
Call(s_mathMinIntInt);
}
Stloc(iterationLocal);
}
Expand Down

0 comments on commit 08e0f89

Please sign in to comment.