From 8273d0f1d56839178daa0e3af461f474c63218d5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 11 Oct 2025 15:59:31 +0000 Subject: [PATCH 1/5] Initial plan From e923223d4c6c000359bcab4a8047cbb53fcad089 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 11 Oct 2025 16:34:43 +0000 Subject: [PATCH 2/5] Fix NonBacktracking regex empty capture groups with newline at end The issue was in DefaultInputReader.GetPositionId which was applying special handling for '\n' at the end of input unconditionally. This special handling is only needed for the \Z anchor. Added a check for _containsEndZAnchor to only apply the special handling when needed. Also added regression tests to verify the fix. Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 2 +- .../tests/FunctionalTests/Regex.Groups.Tests.cs | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 8ce8bcc203dab9..e385eede9c6d9a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1483,7 +1483,7 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan { // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor int c = input[pos]; - return c == '\n' && pos == input.Length - 1 ? + return c == '\n' && pos == input.Length - 1 && matcher._containsEndZAnchor ? 
matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input matcher._mintermClassifier.GetMintermID(c); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs index f95299c430ee7e..03e0b52d3efbda 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs @@ -461,6 +461,13 @@ public static IEnumerable Groups_MemberData() yield return (enUS, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" }); yield return (enUS, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" }); yield return (enUS, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" }); + + // Regression test for https://github.com/dotnet/runtime/issues/110604 + // Capture groups with beginning anchor and newline at end should work correctly + yield return (enUS, @"^(A)(\s)", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); + yield return (enUS, @"^(A)(\s)", "A\n", RegexOptions.Multiline, new string[] { "A\n", "A", "\n" }); + yield return (enUS, @"^(A)(\s)", "A ", RegexOptions.None, new string[] { "A ", "A", " " }); + if (!RegexHelpers.IsNonBacktracking(engine)) // ECMAScript not supported { yield return (enUS, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" }); From 8a51bd5a2043b6b2a803719262815c9da556f65f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Oct 2025 12:15:41 +0000 Subject: [PATCH 3/5] Fix issue number reference in test comment Corrected the issue number from #110604 to 
#120202 in the regression test comment. Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../tests/FunctionalTests/Regex.Groups.Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs index 03e0b52d3efbda..2a04c99028779e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs @@ -462,7 +462,7 @@ public static IEnumerable Groups_MemberData() yield return (enUS, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" }); yield return (enUS, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" }); - // Regression test for https://github.com/dotnet/runtime/issues/110604 + // Regression test for https://github.com/dotnet/runtime/issues/120202 // Capture groups with beginning anchor and newline at end should work correctly yield return (enUS, @"^(A)(\s)", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); yield return (enUS, @"^(A)(\s)", "A\n", RegexOptions.Multiline, new string[] { "A\n", "A", "\n" }); From 4fe286eb2be21d732a1e6a79676b1805dc6d1acb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Oct 2025 12:52:33 +0000 Subject: [PATCH 4/5] Fix capture groups for all anchor combinations with newline at end The issue was that FindSubcaptures was using DefaultInputReader which applies special \n handling for \Z anchor. This interfered with capture tracking for patterns with beginning anchors but without \Z. 
The fix is to make FindSubcaptures always use NoZAnchorOptimizedInputReader, which doesn't apply the special \n handling. This allows: - Match finding to use the special \n handling for \Z anchor (correct) - Capture tracking to use the actual character without special handling (correct) Added regression tests for all anchor combinations. Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 8 ++++---- .../tests/FunctionalTests/Regex.Groups.Tests.cs | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index e385eede9c6d9a..095394db016c8e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -474,9 +474,9 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - Registers endRegisters = _containsAnyAnchor ? 
- FindSubcaptures<DefaultInputReader>(input, matchStart, matchEnd, perThreadData) : - FindSubcaptures<NoZAnchorOptimizedInputReader>(input, matchStart, matchEnd, perThreadData); + // Always use NoZAnchorOptimizedInputReader for FindSubcaptures to avoid the special \n handling + // that's needed for \Z anchor matching but interferes with capture tracking + Registers endRegisters = FindSubcaptures<NoZAnchorOptimizedInputReader>(input, matchStart, matchEnd, perThreadData); return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } @@ -1483,7 +1483,7 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan { // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor int c = input[pos]; - return c == '\n' && pos == input.Length - 1 && matcher._containsEndZAnchor ? + return c == '\n' && pos == input.Length - 1 ? matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input matcher._mintermClassifier.GetMintermID(c); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs index 2a04c99028779e..096c5829b4ec3c 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs @@ -463,8 +463,11 @@ public static IEnumerable Groups_MemberData() yield return (enUS, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" }); // Regression test for https://github.com/dotnet/runtime/issues/120202 - // Capture groups with beginning anchor and newline at end should work correctly + // Capture groups with beginning anchor and newline at end should work correctly with different end anchors yield return (enUS, @"^(A)(\s)", "A\n", RegexOptions.None, 
new string[] { "A\n", "A", "\n" }); + yield return (enUS, @"^(A)(\s)\z", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); + yield return (enUS, @"^(A)(\s)$", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); + yield return (enUS, @"^(A)(\s)\Z", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); yield return (enUS, @"^(A)(\s)", "A\n", RegexOptions.Multiline, new string[] { "A\n", "A", "\n" }); yield return (enUS, @"^(A)(\s)", "A ", RegexOptions.None, new string[] { "A ", "A", " " }); From ddba3c616832b9ca18003e7dd899f285a54dae2c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Oct 2025 16:48:06 +0000 Subject: [PATCH 5/5] Refine fix to only apply special \n handling for \Z anchor Based on feedback from @ieviev, the special \n handling should only apply for \Z anchor (uppercase), not for \z or other anchors. This ensures: - Patterns with \Z use special handling (for lookahead semantics) - Patterns with \z, ^, or no end anchor use normal handling (correct captures) Added test case for (A)(\s)\z without beginning anchor as requested. All 29,315 tests pass. 
Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../Symbolic/SymbolicRegexMatcher.cs | 12 +++++++----- .../tests/FunctionalTests/Regex.Groups.Tests.cs | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 095394db016c8e..02367fa02aeab1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -474,9 +474,11 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - // Always use NoZAnchorOptimizedInputReader for FindSubcaptures to avoid the special \n handling - // that's needed for \Z anchor matching but interferes with capture tracking - Registers endRegisters = FindSubcaptures<NoZAnchorOptimizedInputReader>(input, matchStart, matchEnd, perThreadData); + // Use DefaultInputReader when pattern contains \Z anchor (needs special \n handling), + // otherwise use NoZAnchorOptimizedInputReader for correct capture tracking + Registers endRegisters = _containsEndZAnchor ? + FindSubcaptures<DefaultInputReader>(input, matchStart, matchEnd, perThreadData) : + FindSubcaptures<NoZAnchorOptimizedInputReader>(input, matchStart, matchEnd, perThreadData); return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } @@ -1481,9 +1483,9 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan { if ((uint)pos < (uint)input.Length) { - // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor + // Find the minterm, handling the special case for the last \n for \Z anchor int c = input[pos]; - return c == '\n' && pos == input.Length - 1 ? 
+ return c == '\n' && pos == input.Length - 1 && matcher._containsEndZAnchor ? matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input matcher._mintermClassifier.GetMintermID(c); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs index 096c5829b4ec3c..6d7c7917f8e3ef 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs @@ -468,6 +468,7 @@ public static IEnumerable Groups_MemberData() yield return (enUS, @"^(A)(\s)\z", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); yield return (enUS, @"^(A)(\s)$", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); yield return (enUS, @"^(A)(\s)\Z", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); + yield return (enUS, @"(A)(\s)\z", "A\n", RegexOptions.None, new string[] { "A\n", "A", "\n" }); // without beginning anchor yield return (enUS, @"^(A)(\s)", "A\n", RegexOptions.Multiline, new string[] { "A\n", "A", "\n" }); yield return (enUS, @"^(A)(\s)", "A ", RegexOptions.None, new string[] { "A ", "A", " " });