-
Notifications
You must be signed in to change notification settings - Fork 4.9k
/
Copy pathRegexGenerator.cs
394 lines (349 loc) · 20.6 KB
/
RegexGenerator.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.CodeDom.Compiler;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
[assembly: System.Resources.NeutralResourcesLanguage("en-us")]
namespace System.Text.RegularExpressions.Generator
{
/// <summary>Generates C# source code to implement regular expressions.</summary>
[Generator(LanguageNames.CSharp)]
public partial class RegexGenerator : IIncrementalGenerator
{
/// <summary>Name of the type emitted to contain helpers used by the generated code.</summary>
private const string HelpersTypeName = "Utilities";
/// <summary>Namespace containing all the generated code.</summary>
private const string GeneratedNamespace = "System.Text.RegularExpressions.Generated";
/// <summary>
/// Code for a [GeneratedCode] attribute to put on the top-level generated members.
/// The tool name and version are taken from this generator's own assembly name.
/// </summary>
private static readonly string s_generatedCodeAttribute = $"GeneratedCodeAttribute(\"{typeof(RegexGenerator).Assembly.GetName().Name}\", \"{typeof(RegexGenerator).Assembly.GetName().Version}\")";
/// <summary>
/// Header lines written at the top of the generated file: the auto-generated marker,
/// the nullable directive, and suppressions for warnings the generated code may trigger.
/// </summary>
private static readonly string[] s_headers =
[
"// <auto-generated/>",
"#nullable enable",
"#pragma warning disable CS0162 // Unreachable code",
"#pragma warning disable CS0164 // Unreferenced label",
"#pragma warning disable CS0219 // Variable assigned but never used",
];
/// <summary>Compilation-level settings that travel with each regex through the generator pipeline.</summary>
/// <param name="AllowUnsafe">Forwarded to <c>EmitRegexDerivedImplementation</c> when a full implementation is emitted.</param>
/// <param name="CheckOverflow">Forwarded to <c>EmitRegexDerivedTypeRunnerFactory</c> when the matching logic is generated.</param>
/// <param name="LanguageVersion">C# language version of the compilation; full code generation requires C# 11 or higher.</param>
/// <remarks>
/// Declared as a record struct, so it has value equality.
/// NOTE(review): the values are populated outside this part of the file - confirm their source there.
/// </remarks>
internal record struct CompilationData(bool AllowUnsafe, bool CheckOverflow, LanguageVersion LanguageVersion);
/// <summary>
/// Registers the incremental pipeline that discovers members attributed with GeneratedRegex, parses and
/// analyzes each pattern, generates the matching code, and finally emits a single source file
/// ("RegexGenerator.g.cs") along with any diagnostics.
/// </summary>
/// <param name="context">The initialization context used to build the pipeline and register the source output.</param>
public void Initialize(IncrementalGeneratorInitializationContext context)
{
// Produces one entry per generated regex. This may be:
// - DiagnosticData in the case of a failure that should end the compilation
// - (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers, CompilationData compilationData) in the case of valid regex
// - (RegexMethod regexMethod, string reason, DiagnosticData diagnostic, CompilationData compilationData) in the case of a limited-support regex
IncrementalValueProvider<ImmutableArray<object>> results =
context.SyntaxProvider
// Find all method, property, indexer, and accessor declaration nodes attributed with GeneratedRegex and gather the required information.
// The predicate will be run once for every attributed node in the same file that's being modified.
// The transform will be run once for every attributed node in the compilation.
// Thus, both should do the minimal amount of work required and get out. This should also have extracted
// everything from the target necessary to do all subsequent analysis and should return an object that's
// meaningfully comparable and that doesn't reference anything from the compilation: we want to ensure
// that any successful cached results are idempotent for the input such that they don't trigger downstream work
// if there are no changes.
.ForAttributeWithMetadataName(
GeneratedRegexAttributeName,
(node, _) => node is MethodDeclarationSyntax or PropertyDeclarationSyntax or IndexerDeclarationSyntax or AccessorDeclarationSyntax,
GetRegexMethodDataOrFailureDiagnostic)
// Filter out any parsing errors that resulted in null objects being returned.
.Where(static m => m is not null)
// The input here will either be a DiagnosticData (in the case of something erroneous detected in GetRegexMethodDataOrFailureDiagnostic)
// or it will be a RegexPatternAndSyntax containing all of the successfully parsed data from the attribute/method.
.Select((methodOrDiagnostic, _) =>
{
if (methodOrDiagnostic is RegexPatternAndSyntax method)
{
// Parse and analyze the pattern. Any exception thrown while doing so is converted into
// an InvalidRegexArguments diagnostic carrying the exception message.
try
{
RegexTree regexTree = RegexParser.Parse(method.Pattern, method.Options | RegexOptions.Compiled, method.Culture); // make sure Compiled is included to get all optimizations applied to it
AnalysisResults analysis = RegexTreeAnalyzer.Analyze(regexTree);
return new RegexMethod(method.DeclaringType, method.IsProperty, method.DiagnosticLocation, method.MemberName, method.Modifiers, method.NullableRegex, method.Pattern, method.Options, method.MatchTimeout, regexTree, analysis, method.CompilationData);
}
catch (Exception e)
{
return new DiagnosticData(DiagnosticDescriptors.InvalidRegexArguments, method.DiagnosticLocation, e.Message);
}
}
// Anything else (i.e. an earlier diagnostic) is passed through unchanged.
return methodOrDiagnostic;
})
// Generate the RunnerFactory for each regex, if possible. This is where the bulk of the implementation occurs.
.Select((state, _) =>
{
if (state is not RegexMethod regexMethod)
{
Debug.Assert(state is DiagnosticData);
return state;
}
// If we're unable to generate a full implementation for this regex, report a diagnostic.
// We'll still output a limited implementation that just caches a new Regex(...).
if (!SupportsCodeGeneration(regexMethod, regexMethod.CompilationData.LanguageVersion, out string? reason))
{
return (regexMethod, reason, new DiagnosticData(DiagnosticDescriptors.LimitedSourceGeneration, regexMethod.DiagnosticLocation), regexMethod.CompilationData);
}
// Generate the core logic for the regex.
Dictionary<string, string[]> requiredHelpers = new();
var sw = new StringWriter();
var writer = new IndentedTextWriter(sw);
// The text is generated two indentation levels deep; the indent is restored afterwards.
writer.Indent += 2;
writer.WriteLine();
EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers, regexMethod.CompilationData.CheckOverflow);
writer.Indent -= 2;
return (regexMethod, sw.ToString(), requiredHelpers, regexMethod.CompilationData);
})
// Combine all of the generated text outputs into a single batch. We then generate a single source output from that batch.
.Collect()
// Apply sequence equality comparison on the result array for incremental caching.
.WithComparer(new ObjectImmutableArraySequenceEqualityComparer());
// When there is something to output, take all the generated strings and concatenate them to output,
// and raise all of the created diagnostics.
context.RegisterSourceOutput(results, static (context, results) =>
{
// Report any top-level diagnostics.
bool allFailures = true;
foreach (object result in results)
{
if (result is DiagnosticData d)
{
context.ReportDiagnostic(d.ToDiagnostic());
}
else
{
allFailures = false;
}
}
// Nothing but failures (or no results at all): there is no code to emit.
if (allFailures)
{
return;
}
// At this point we'll be emitting code. Create a writer to hold it all.
using StringWriter sw = new();
using IndentedTextWriter writer = new(sw);
// Add file headers. The usings are emitted later, inside the generated namespace.
foreach (string header in s_headers)
{
writer.WriteLine(header);
}
writer.WriteLine();
// For every generated type, we give it an incrementally increasing ID, in order to create
// unique type names even in situations where method names were the same, while also keeping
// the type names short. Note that this is why we only generate the RunnerFactory implementations
// earlier in the pipeline... we want to avoid generating code that relies on the class names
// until we're able to iterate through them linearly keeping track of a deterministic ID
// used to name them. The boilerplate code generation that happens here is minimal when compared to
// the work required to generate the actual matching code for the regex.
int id = 0;
// To minimize generated code in the event of duplicated regexes, we only emit one derived Regex type per unique
// expression/options/timeout. A Dictionary<(expression, options, timeout), RegexMethod> is used to deduplicate, where the value of the
// pair is the implementation used for the key.
var emittedExpressions = new Dictionary<(string Pattern, RegexOptions Options, int? Timeout), RegexMethod>();
// If we have any (RegexMethod regexMethod, string reason, DiagnosticData diagnostic, CompilationData compilationData), these are regexes for which we have
// limited support and need to simply output boilerplate. We need to emit their diagnostics.
// If we have any (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers, CompilationData compilationData),
// those are generated implementations to be emitted. We need to gather up their required helpers.
Dictionary<string, string[]> requiredHelpers = new();
foreach (object? result in results)
{
RegexMethod? regexMethod = null;
if (result is ValueTuple<RegexMethod, string, DiagnosticData, CompilationData> limitedSupportResult)
{
context.ReportDiagnostic(limitedSupportResult.Item3.ToDiagnostic());
regexMethod = limitedSupportResult.Item1;
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
{
// Merge this regex's helpers into the shared set; the first registration of a given key wins.
foreach (KeyValuePair<string, string[]> helper in regexImpl.Item3)
{
if (!requiredHelpers.ContainsKey(helper.Key))
{
requiredHelpers.Add(helper.Key, helper.Value);
}
}
regexMethod = regexImpl.Item1;
}
if (regexMethod is not null)
{
var key = (regexMethod.Pattern, regexMethod.Options, regexMethod.MatchTimeout);
if (emittedExpressions.TryGetValue(key, out RegexMethod? implementation))
{
// Same pattern/options/timeout was already seen: reuse the previously assigned generated type name.
regexMethod.IsDuplicate = true;
regexMethod.GeneratedName = implementation.GeneratedName;
}
else
{
// First occurrence: assign a new unique name and remember it for later duplicates.
regexMethod.IsDuplicate = false;
regexMethod.GeneratedName = $"{regexMethod.MemberName}_{id++}";
emittedExpressions.Add(key, regexMethod);
}
EmitRegexPartialMethod(regexMethod, writer);
writer.WriteLine();
}
}
// At this point we've emitted all the partial method definitions, but we still need to emit the actual regex-derived implementations.
// These are all emitted inside of our generated class.
writer.WriteLine($"namespace {GeneratedNamespace}");
writer.WriteLine($"{{");
// We emit usings here now that we're inside of a namespace block and are no longer emitting code into
// a user's partial type. We can now rely on binding rules mapping to these usings and don't need to
// use global-qualified names for the rest of the implementation.
writer.WriteLine($" using System;");
writer.WriteLine($" using System.Buffers;");
writer.WriteLine($" using System.CodeDom.Compiler;");
writer.WriteLine($" using System.Collections;");
writer.WriteLine($" using System.ComponentModel;");
writer.WriteLine($" using System.Globalization;");
writer.WriteLine($" using System.Runtime.CompilerServices;");
writer.WriteLine($" using System.Text.RegularExpressions;");
writer.WriteLine($" using System.Threading;");
writer.WriteLine($"");
// Emit each Regex-derived type. Entries flagged as duplicates are skipped because they share
// the generated type of the first regex with the same pattern/options/timeout.
writer.Indent++;
foreach (object? result in results)
{
if (result is ValueTuple<RegexMethod, string, DiagnosticData, CompilationData> limitedSupportResult)
{
if (!limitedSupportResult.Item1.IsDuplicate)
{
EmitRegexLimitedBoilerplate(writer, limitedSupportResult.Item1, limitedSupportResult.Item2, limitedSupportResult.Item4.LanguageVersion);
writer.WriteLine();
}
}
else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>, CompilationData> regexImpl)
{
if (!regexImpl.Item1.IsDuplicate)
{
EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, regexImpl.Item4.AllowUnsafe);
writer.WriteLine();
}
}
}
writer.Indent--;
// If any of the Regex-derived types asked for helper methods, emit those now.
if (requiredHelpers.Count != 0)
{
writer.Indent++;
writer.WriteLine($"/// <summary>Helper methods used by generated <see cref=\"Regex\"/>-derived implementations.</summary>");
writer.WriteLine($"[{s_generatedCodeAttribute}]");
writer.WriteLine($"file static class {HelpersTypeName}");
writer.WriteLine($"{{");
writer.Indent++;
// Helpers are written in ordinal key order, separated by a blank line.
bool sawFirst = false;
foreach (KeyValuePair<string, string[]> helper in requiredHelpers.OrderBy(h => h.Key, StringComparer.Ordinal))
{
if (sawFirst)
{
writer.WriteLine();
}
sawFirst = true;
foreach (string value in helper.Value)
{
writer.WriteLine(value);
}
}
writer.Indent--;
writer.WriteLine($"}}");
writer.Indent--;
}
writer.WriteLine($"}}");
// Save out the source as a single generated file.
context.AddSource("RegexGenerator.g.cs", sw.ToString());
});
}
/// <summary>Decides whether a full C# implementation can be generated for the given regex.</summary>
/// <remarks>
/// When generation is not supported, <paramref name="reason"/> receives a human-readable explanation.
/// That text is emitted by the source generator as a comment in the generated C# code, so it is not localized.
/// </remarks>
/// <param name="method">The regex whose parsed tree is inspected.</param>
/// <param name="languageVersion">The C# language version of the compilation.</param>
/// <param name="reason">The explanation when the result is <see langword="false"/>; <see langword="null"/> when it is <see langword="true"/>.</param>
/// <returns><see langword="true"/> if code generation is supported; otherwise, <see langword="false"/>.</returns>
private static bool SupportsCodeGeneration(RegexMethod method, LanguageVersion languageVersion, [NotNullWhen(false)] out string? reason)
{
    // The language version is checked before the tree is even looked at.
    if (languageVersion < LanguageVersion.CSharp11)
    {
        reason = "the language version must be C# 11 or higher.";
        return false;
    }

    RegexNode root = method.Tree.Root;

    // A pattern that can't be compiled can't be source generated either;
    // SupportsCompilation supplies the reason in that case.
    if (!root.SupportsCompilation(out reason))
    {
        return false;
    }

    // Case-insensitive matching relies on Regex's internal case equivalence table. Nearly all uses of that
    // table happen at construction time, where characters that take part in case conversion are replaced
    // by sets listing every character that could match. Backreferences are the exception: they need
    // case-insensitive comparisons at match time. The table is internal and unreachable from the emitted
    // type, so such patterns don't get a source generated Regex-derived type.
    if (ContainsIgnoreCaseBackreference(root))
    {
        reason = "the expression contains case-insensitive backreferences which are not supported by the source generator";
        return false;
    }

    // Compilable and free of case-insensitive backreferences: generation is supported.
    reason = null;
    return true;

    // Depth-first search for a Backreference node carrying the IgnoreCase option.
    static bool ContainsIgnoreCaseBackreference(RegexNode current)
    {
        bool isBackreference = current.Kind is RegexNodeKind.Backreference;
        if (isBackreference && (current.Options & RegexOptions.IgnoreCase) != RegexOptions.None)
        {
            return true;
        }

        // Recursing here is not expected to exhaust the stack: this runs only after the
        // SupportsCompilation check above, which is said to cap the tree depth at 40.
        for (int index = 0, count = current.ChildCount(); index < count; index++)
        {
            if (ContainsIgnoreCaseBackreference(current.Child(index)))
            {
                return true;
            }
        }

        return false;
    }
}
/// <summary>Holds the pieces needed to build a <see cref="Diagnostic"/> on demand.</summary>
/// <remarks>
/// <see cref="Diagnostic"/> instances do not have value equality semantics, so keeping them in an
/// object model that flows through the pipeline can result in unnecessary recompilation.
/// </remarks>
/// <param name="descriptor">The descriptor identifying the diagnostic.</param>
/// <param name="location">The location the diagnostic is reported at.</param>
/// <param name="arg">Optional single message argument; omitted from the diagnostic when <see langword="null"/>.</param>
private sealed record class DiagnosticData(DiagnosticDescriptor descriptor, Location location, object? arg = null)
{
    /// <summary>Builds the <see cref="Diagnostic"/> described by this instance.</summary>
    /// <returns>A diagnostic with either no message arguments or the single stored argument.</returns>
    public Diagnostic ToDiagnostic()
    {
        object?[] messageArgs = arg is null ? [] : [arg];
        return Diagnostic.Create(descriptor, location, messageArgs);
    }
}
/// <summary>
/// Compares two <see cref="ImmutableArray{T}"/> of objects element by element, using each element's
/// own <see cref="object.Equals(object)"/> implementation.
/// </summary>
/// <remarks>
/// A default (uninitialized) <see cref="ImmutableArray{T}"/> is handled explicitly, because reading
/// its length or elements would throw: two default arrays are equal, a default array never equals an
/// initialized one, and the hash code of a default array is 0.
/// </remarks>
private sealed class ObjectImmutableArraySequenceEqualityComparer : IEqualityComparer<ImmutableArray<object>>
{
    /// <summary>Determines whether both arrays have the same length and pairwise-equal elements.</summary>
    /// <param name="left">The first array.</param>
    /// <param name="right">The second array.</param>
    /// <returns><see langword="true"/> if the arrays are sequence-equal; otherwise, <see langword="false"/>.</returns>
    public bool Equals(ImmutableArray<object> left, ImmutableArray<object> right)
    {
        // Guard against default arrays before touching Length or the indexer.
        if (left.IsDefault || right.IsDefault)
        {
            return left.IsDefault && right.IsDefault;
        }

        if (left.Length != right.Length)
        {
            return false;
        }

        for (int i = 0; i < left.Length; i++)
        {
            // A null element only matches another null element; otherwise defer to the left element's Equals.
            bool areEqual = left[i] is { } leftElem
                ? leftElem.Equals(right[i])
                : right[i] is null;
            if (!areEqual)
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>Computes a hash code by folding every element, in order, into a running hash.</summary>
    /// <param name="obj">The array to hash.</param>
    /// <returns>The combined hash code; 0 for an empty or default array.</returns>
    public int GetHashCode([DisallowNull] ImmutableArray<object> obj)
    {
        if (obj.IsDefault)
        {
            return 0;
        }

        int hash = 0;
        for (int i = 0; i < obj.Length; i++)
        {
            hash = (hash, obj[i]).GetHashCode();
        }

        return hash;
    }
}
}
}