Skip to content

Commit 252034f

Browse files
Copilotstephentoubdanmoseley
authored
Fix escaped pattern display in GeneratedRegex XML documentation (#120098)
* Initial plan * Fix escaped pattern in GeneratedRegex summary - remove double escaping Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> * Update test expectations to match fixed single-escaped pattern behavior Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> * Fix XML parsing errors by properly escaping invalid XML characters in pattern documentation Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> * Consolidate XML escaping logic into existing EscapeXmlComment method Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> * Fix C1 control character escaping to resolve all test failures Co-authored-by: danmoseley <6385855+danmoseley@users.noreply.github.com> * Simplify EscapeXmlComment --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> Co-authored-by: danmoseley <6385855+danmoseley@users.noreply.github.com> Co-authored-by: Stephen Toub <stoub@microsoft.com>
1 parent 98f102a commit 252034f

File tree

2 files changed

+93
-6
lines changed

2 files changed

+93
-6
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
using System.Runtime.CompilerServices;
1414
using System.Security.Cryptography;
1515
using System.Threading;
16+
using System.Text;
1617
using Microsoft.CodeAnalysis;
1718
using Microsoft.CodeAnalysis.CSharp;
1819

@@ -26,9 +27,34 @@ namespace System.Text.RegularExpressions.Generator
2627
{
2728
public partial class RegexGenerator
2829
{
29-
/// <summary>Escapes '&amp;', '&lt;' and '&gt;' characters. We aren't using HtmlEncode as that would also escape single and double quotes.</summary>
30-
private static string EscapeXmlComment(string text) =>
31-
text.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");
30+
/// <summary>Escapes characters that are invalid in XML comments.</summary>
31+
private static string EscapeXmlComment(string text)
32+
{
33+
if (!string.IsNullOrEmpty(text))
34+
{
35+
StringBuilder sb = new(text.Length);
36+
foreach (char c in text)
37+
{
38+
switch ((int)c)
39+
{
40+
// Escape XML entities.
41+
case '&': sb.Append("&amp;"); break;
42+
case '<': sb.Append("&lt;"); break;
43+
case '>': sb.Append("&gt;"); break;
44+
45+
// Propagate all other valid XML characters as-is. Control chars are considered invalid.
46+
case (>= 0x20 and <= 0x7F) or (>= 0xA0 and <= 0xD7FF) or (>= 0xE000 and <= 0xFFFD): sb.Append(c); break;
47+
48+
// Use Unicode escape sequences for everything else.
49+
default: sb.Append($"\\u{(int)c:X4}"); break;
50+
}
51+
}
52+
53+
text = sb.ToString();
54+
}
55+
56+
return text;
57+
}
3258

3359
/// <summary>Emits the definition of the partial method. This method just delegates to the property cache on the generated Regex-derived type.</summary>
3460
private static void EmitRegexPartialMethod(RegexMethod regexMethod, IndentedTextWriter writer)
@@ -59,7 +85,7 @@ private static void EmitRegexPartialMethod(RegexMethod regexMethod, IndentedText
5985
// Emit the partial method definition.
6086
writer.WriteLine($"/// <remarks>");
6187
writer.WriteLine($"/// Pattern:<br/>");
62-
writer.WriteLine($"/// <code>{EscapeXmlComment(Literal(regexMethod.Pattern, quote: false))}</code><br/>");
88+
writer.WriteLine($"/// <code>{EscapeXmlComment(regexMethod.Pattern)}</code><br/>");
6389
if (regexMethod.Options != RegexOptions.None)
6490
{
6591
writer.WriteLine($"/// Options:<br/>");

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ partial class C
8080
{
8181
/// <remarks>
8282
/// Pattern:<br/>
83-
/// <code>^(?&lt;proto&gt;\\w+)://[^/]+?(?&lt;port&gt;:\\d+)?/</code><br/>
83+
/// <code>^(?&lt;proto&gt;\w+)://[^/]+?(?&lt;port&gt;:\d+)?/</code><br/>
8484
/// Explanation:<br/>
8585
/// <code>
8686
/// ○ Match if at the beginning of the string.<br/>
@@ -482,7 +482,7 @@ partial class C
482482
{
483483
/// <remarks>
484484
/// Pattern:<br/>
485-
/// <code>href\\s*=\\s*(?:["'](?&lt;1&gt;[^"']*)["']|(?&lt;1&gt;[^&gt;\\s]+))</code><br/>
485+
/// <code>href\s*=\s*(?:["'](?&lt;1&gt;[^"']*)["']|(?&lt;1&gt;[^&gt;\s]+))</code><br/>
486486
/// Explanation:<br/>
487487
/// <code>
488488
/// ○ Match the string "href".<br/>
@@ -1132,5 +1132,66 @@ file static class Utilities
11321132
"""
11331133
};
11341134
}
1135+
1136+
[Fact]
1137+
public async Task Pattern_Should_Not_Be_Double_Escaped_In_Documentation()
1138+
{
1139+
string program = """
1140+
using System.Text.RegularExpressions;
1141+
partial class C
1142+
{
1143+
[GeneratedRegex(@"\.")]
1144+
public static partial Regex DotPattern();
1145+
}
1146+
""";
1147+
1148+
string actual = await RegexGeneratorHelper.GenerateSourceText(program, allowUnsafe: true, checkOverflow: false);
1149+
1150+
// The pattern should show \. (single backslash) not \\. (double backslash) in the documentation
1151+
Assert.Contains("/// <code>\\.</code><br/>", actual);
1152+
Assert.DoesNotContain("/// <code>\\\\.</code><br/>", actual);
1153+
}
1154+
1155+
[Fact]
1156+
public async Task Pattern_With_Control_Characters_Should_Be_Escaped_For_XML()
1157+
{
1158+
string program = """
1159+
using System.Text.RegularExpressions;
1160+
partial class C
1161+
{
1162+
[GeneratedRegex("a\0b")]
1163+
public static partial Regex NullCharPattern();
1164+
}
1165+
""";
1166+
1167+
string actual = await RegexGeneratorHelper.GenerateSourceText(program, allowUnsafe: true, checkOverflow: false);
1168+
1169+
// The pattern should escape null characters as Unicode escape sequences for XML safety in the documentation
1170+
Assert.Contains("/// <code>a\\u0000b</code><br/>", actual);
1171+
1172+
// The actual pattern string (base.pattern assignment) should properly escape the null character for C#
1173+
Assert.Contains("base.pattern = \"a\\0b\";", actual);
1174+
}
1175+
1176+
[Fact]
1177+
public async Task Pattern_With_Newline_Should_Be_Escaped_For_XML()
1178+
{
1179+
string program = """
1180+
using System.Text.RegularExpressions;
1181+
partial class C
1182+
{
1183+
[GeneratedRegex("\n")]
1184+
public static partial Regex NewlinePattern();
1185+
}
1186+
""";
1187+
1188+
string actual = await RegexGeneratorHelper.GenerateSourceText(program, allowUnsafe: true, checkOverflow: false);
1189+
1190+
// The pattern should escape newline as Unicode escape sequence to avoid breaking XML comments
1191+
Assert.Contains("/// <code>\\u000A</code><br/>", actual);
1192+
1193+
// The actual pattern string should properly escape the newline for C#
1194+
Assert.Contains("base.pattern = \"\\n\";", actual);
1195+
}
11351196
}
11361197
}

0 commit comments

Comments
 (0)