Skip to content

Commit dc3c2e5

Browse files
CyrusNajmabadiCopilotjjonescz
authored
Unify raw string lexing and parsing (#80817)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: CyrusNajmabadi <4564579+CyrusNajmabadi@users.noreply.github.com> Co-authored-by: Jan Jones <jan.jones.cz@gmail.com>
1 parent e05c5bc commit dc3c2e5

File tree

7 files changed

+294
-407
lines changed

7 files changed

+294
-407
lines changed

src/Compilers/CSharp/Portable/Parser/LanguageParser.cs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11872,10 +11872,6 @@ ExpressionSyntax parsePrimaryExpressionWithoutPostfix(Precedence precedence)
1187211872
case SyntaxKind.NumericLiteralToken:
1187311873
case SyntaxKind.StringLiteralToken:
1187411874
case SyntaxKind.Utf8StringLiteralToken:
11875-
case SyntaxKind.SingleLineRawStringLiteralToken:
11876-
case SyntaxKind.Utf8SingleLineRawStringLiteralToken:
11877-
case SyntaxKind.MultiLineRawStringLiteralToken:
11878-
case SyntaxKind.Utf8MultiLineRawStringLiteralToken:
1187911875
case SyntaxKind.CharacterLiteralToken:
1188011876
return _syntaxFactory.LiteralExpression(SyntaxFacts.GetLiteralExpression(tk), this.EatToken());
1188111877
case SyntaxKind.InterpolatedStringStartToken:
@@ -11885,6 +11881,11 @@ ExpressionSyntax parsePrimaryExpressionWithoutPostfix(Precedence precedence)
1188511881
throw new NotImplementedException(); // this should not occur because these tokens are produced and parsed immediately
1188611882
case SyntaxKind.InterpolatedStringToken:
1188711883
return this.ParseInterpolatedStringToken();
11884+
case SyntaxKind.SingleLineRawStringLiteralToken:
11885+
case SyntaxKind.Utf8SingleLineRawStringLiteralToken:
11886+
case SyntaxKind.MultiLineRawStringLiteralToken:
11887+
case SyntaxKind.Utf8MultiLineRawStringLiteralToken:
11888+
return this.ParseRawStringToken();
1188811889
case SyntaxKind.OpenParenToken:
1188911890
{
1189011891
return IsPossibleLambdaExpression(precedence) && this.TryParseLambdaExpression() is { } lambda

src/Compilers/CSharp/Portable/Parser/LanguageParser_InterpolatedString.cs

Lines changed: 149 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,89 @@
55
using System;
66
using System.Diagnostics;
77
using System.Diagnostics.CodeAnalysis;
8-
using System.Runtime.CompilerServices;
98
using System.Text;
109
using Microsoft.CodeAnalysis.PooledObjects;
1110
using Microsoft.CodeAnalysis.Text;
12-
using Roslyn.Utilities;
1311

1412
namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax
1513
{
1614
internal partial class LanguageParser
1715
{
18-
private ExpressionSyntax ParseInterpolatedStringToken()
16+
private LiteralExpressionSyntax ParseRawStringToken()
17+
{
18+
var originalToken = this.EatToken();
19+
20+
var expressionKind = SyntaxFacts.GetLiteralExpression(originalToken.Kind);
21+
Debug.Assert(expressionKind != SyntaxKind.None);
22+
23+
// We want to share as much code as possible with raw-interpolated-strings. Especially the code for dealing
24+
// with indentation removal and determining the 'value' of the string. As such, we will reinterpret this
25+
// raw string as an interpolated string with no $'s and no holes, and then extract out the content token
26+
// from that.
27+
28+
Debug.Assert(originalToken.Text is ['"', '"', '"', ..]);
29+
30+
var interpolatedString = ParseInterpolatedOrRawStringToken(originalToken, isInterpolatedString: false);
31+
32+
// Because there are no actual interpolations, we expect to only see a single text content node containing
33+
// the interpreted value of the raw string.
34+
Debug.Assert(interpolatedString.StringStartToken.Kind is SyntaxKind.InterpolatedSingleLineRawStringStartToken or SyntaxKind.InterpolatedMultiLineRawStringStartToken);
35+
Debug.Assert(interpolatedString.Contents is [InterpolatedStringTextSyntax]);
36+
37+
var interpolatedText = (InterpolatedStringTextSyntax)interpolatedString.Contents[0]!;
38+
39+
var diagnostics = getDiagnostics();
40+
41+
// We preserve everything from the original raw token, except that we use the computed value text from the
42+
// interpolated text token instead, as long as we got no diagnostics for this raw string.
43+
var finalToken = SyntaxFactory
44+
.Literal(originalToken.GetLeadingTrivia(), originalToken.Text, originalToken.Kind, getTokenValue(), originalToken.GetTrailingTrivia())
45+
.WithDiagnosticsGreen(diagnostics);
46+
47+
return _syntaxFactory.LiteralExpression(expressionKind, finalToken);
48+
49+
DiagnosticInfo[] getDiagnostics()
50+
{
51+
var diagnosticsBuilder = ArrayBuilder<DiagnosticInfo>.GetInstance();
52+
53+
// Add any diagnostics from the interpolated string as a whole.
54+
diagnosticsBuilder.AddRange(interpolatedString.GetDiagnostics());
55+
56+
// We may have diagnostics on the InterpolatedStringText node itself, but not on the text token inside it
57+
// (since we create it, and immediately add it to the InterpolatedStringText node). If so, move those over.
58+
// However, offset them first: they are relative to the text token, and now need to be relative to the start of
59+
// the token as a whole.
60+
Debug.Assert(!interpolatedText.TextToken.ContainsDiagnostics);
61+
var textTokenDiagnostics = MoveDiagnostics(interpolatedText.GetDiagnostics(), interpolatedString.StringStartToken.Width);
62+
if (textTokenDiagnostics != null)
63+
diagnosticsBuilder.AddRange(textTokenDiagnostics);
64+
65+
// If the original token had diagnostics, then we absolutely must have produced some diagnostics creating
66+
// the interpolated version. Note: the converse does not hold. Producing the interpolation may produce
67+
// indentation diagnostics, which are not something the lexer would have produced.
68+
if (originalToken.ContainsDiagnostics)
69+
Debug.Assert(diagnosticsBuilder.Count > 0);
70+
71+
return diagnosticsBuilder.ToArrayAndFree();
72+
}
73+
74+
string getTokenValue()
75+
{
76+
if (diagnostics.Length == 0)
77+
return interpolatedText.TextToken.GetValueText();
78+
79+
// Preserve what the lexer used to do here. In the presence of any diagnostics, the text of the raw
80+
// string minus the starting quotes is used as the value.
81+
var startIndex = 0;
82+
var originalText = originalToken.Text;
83+
while (startIndex < originalText.Length && originalText[startIndex] is '"')
84+
startIndex++;
85+
86+
return originalText[startIndex..];
87+
}
88+
}
89+
90+
private InterpolatedStringExpressionSyntax ParseInterpolatedStringToken()
1991
{
2092
// We don't want to make the scanner stateful (between tokens) if we can possibly avoid it.
2193
// The approach implemented here is
@@ -40,9 +112,23 @@ private ExpressionSyntax ParseInterpolatedStringToken()
40112
Debug.Assert(this.CurrentToken.Kind == SyntaxKind.InterpolatedStringToken);
41113
var originalToken = this.EatToken();
42114

43-
var originalText = originalToken.ValueText; // this is actually the source text
115+
Debug.Assert(originalToken.Text[0] is '$' or '@');
116+
117+
return ParseInterpolatedOrRawStringToken(originalToken, isInterpolatedString: true);
118+
}
119+
120+
/// <summary>
121+
/// Takes the token produced by the lexer for a (raw or regular) interpolated string or non-interpolated raw
122+
/// string literal and creates an actual parsed <see cref="InterpolatedStringExpressionSyntax"/> for the syntax
123+
/// tree. For an interpolated string, this will now contain all the holes parsed out as well. For a raw string
124+
/// this will contain a single <see cref="InterpolatedStringTextSyntax"/> for the contents of the raw string.
125+
/// </summary>
126+
private InterpolatedStringExpressionSyntax ParseInterpolatedOrRawStringToken(
127+
SyntaxToken originalToken,
128+
bool isInterpolatedString)
129+
{
130+
var originalText = originalToken.Text;
44131
var originalTextSpan = originalText.AsSpan();
45-
Debug.Assert(originalText[0] == '$' || originalText[0] == '@');
46132

47133
// compute the positions of the interpolations in the original string literal, if there was an error or not,
48134
// and where the open and close quotes can be found.
@@ -55,19 +141,35 @@ private ExpressionSyntax ParseInterpolatedStringToken()
55141
var needsDedentation = kind == Lexer.InterpolatedStringKind.MultiLineRaw && error == null;
56142

57143
var result = SyntaxFactory.InterpolatedStringExpression(getOpenQuote(), getContent(originalTextSpan), getCloseQuote());
144+
Debug.Assert(originalToken.ToFullString() == result.ToFullString()); // yield from text equals yield from node
145+
146+
#if DEBUG
147+
// In the raw string case, none of the added text tokens should have diagnostics. Any diagnostics should be
148+
// on their containing InterpolatedStringTextSyntax node instead.
149+
if (!isInterpolatedString)
150+
{
151+
foreach (var content in result.Contents)
152+
{
153+
if (content is InterpolatedStringTextSyntax interpolatedText)
154+
Debug.Assert(!interpolatedText.TextToken.ContainsDiagnostics);
155+
}
156+
}
157+
#endif
58158

59-
interpolations.Free();
60159
if (error != null)
61160
result = result.WithDiagnosticsGreen([error]);
62161

63-
Debug.Assert(originalToken.ToFullString() == result.ToFullString()); // yield from text equals yield from node
162+
interpolations.Free();
64163
return result;
65164

66165
void rescanInterpolation(out Lexer.InterpolatedStringKind kind, out SyntaxDiagnosticInfo? error, out Range openQuoteRange, ArrayBuilder<Lexer.Interpolation> interpolations, out Range closeQuoteRange)
67166
{
68167
using var tempLexer = new Lexer(SourceText.From(originalText), this.Options, allowPreprocessorDirectives: false);
69168
var info = default(Lexer.TokenInfo);
70-
tempLexer.ScanInterpolatedStringLiteralTop(ref info, out error, out kind, out openQuoteRange, interpolations, out closeQuoteRange);
169+
tempLexer.ScanInterpolatedOrRawStringLiteralTop(
170+
ref info, isInterpolatedString, out error, out kind, out openQuoteRange, interpolations, out closeQuoteRange);
171+
172+
Debug.Assert(isInterpolatedString || interpolations.Count == 0, "Non-interpolated parsing should never produce interpolations");
71173
}
72174

73175
SyntaxToken getOpenQuote()
@@ -109,7 +211,7 @@ CodeAnalysis.Syntax.InternalSyntax.SyntaxList<InterpolatedStringContentSyntax> g
109211
// Make sure the interpolation starts at the right location.
110212
var indentationError = getInterpolationIndentationError(indentationWhitespace, interpolation);
111213
if (indentationError != null)
112-
interpolationNode = interpolationNode.WithDiagnosticsGreen(new[] { indentationError });
214+
interpolationNode = interpolationNode.WithDiagnosticsGreen([indentationError]);
113215

114216
builder.Add(interpolationNode);
115217
currentContentStart = interpolation.CloseBraceRange.End;
@@ -146,8 +248,17 @@ ReadOnlySpan<char> getIndentationWhitespace(ReadOnlySpan<char> originalTextSpan)
146248
InterpolatedStringContentSyntax? makeContent(
147249
ReadOnlySpan<char> indentationWhitespace, StringBuilder content, bool isFirst, bool isLast, ReadOnlySpan<char> text)
148250
{
149-
if (text.Length == 0)
150-
return null;
251+
if (text.IsEmpty)
252+
{
253+
// For the raw string case, always include an InterpolatedStringText node, even if empty. This
254+
// allows the caller to uniformly assume there is always at least one text token that it can
255+
// extract data from.
256+
return isInterpolatedString
257+
? null
258+
: SyntaxFactory.InterpolatedStringText(
259+
SyntaxFactory.Literal(leading: null, "", SyntaxKind.InterpolatedStringTextToken, "", trailing: null));
260+
261+
}
151262

152263
// If we're not dedenting then just make a standard interpolated text token. Also, we can short-circuit
153264
// if the indentation whitespace is empty (nothing to dedent in that case).
@@ -222,11 +333,13 @@ ReadOnlySpan<char> getIndentationWhitespace(ReadOnlySpan<char> originalTextSpan)
222333
var textString = text.ToString();
223334
var valueString = indentationError != null ? textString : content.ToString();
224335

336+
// Note: we place errors on the InterpolatedStringText node itself, not on the token. This is an
337+
// invariant that higher up callers can depend on.
225338
var node = SyntaxFactory.InterpolatedStringText(
226339
SyntaxFactory.Literal(leading: null, textString, SyntaxKind.InterpolatedStringTextToken, valueString, trailing: null));
227340

228341
return indentationError != null
229-
? node.WithDiagnosticsGreen(new[] { indentationError })
342+
? node.WithDiagnosticsGreen([indentationError])
230343
: node;
231344
}
232345

@@ -285,6 +398,24 @@ SyntaxToken getCloseQuote()
285398
}
286399
}
287400

401+
/// <summary>
402+
/// Converts a whitespace character to its string representation for error messages.
403+
/// </summary>
404+
private static string CharToString(char ch)
405+
{
406+
return ch switch
407+
{
408+
'\t' => @"\t",
409+
'\v' => @"\v",
410+
'\f' => @"\f",
411+
_ => @$"\u{(int)ch:x4}",
412+
};
413+
}
414+
415+
/// <summary>
416+
/// Checks if two whitespace sequences differ at a specific character position where both
417+
/// characters are whitespace but different types (e.g., tab vs space).
418+
/// </summary>
288419
private static bool CheckForSpaceDifference(
289420
ReadOnlySpan<char> currentLineWhitespace,
290421
ReadOnlySpan<char> indentationLineWhitespace,
@@ -300,8 +431,8 @@ private static bool CheckForSpaceDifference(
300431
SyntaxFacts.IsWhitespace(currentLineChar) &&
301432
SyntaxFacts.IsWhitespace(indentationLineChar))
302433
{
303-
currentLineMessage = Lexer.CharToString(currentLineChar);
304-
indentationLineMessage = Lexer.CharToString(indentationLineChar);
434+
currentLineMessage = CharToString(currentLineChar);
435+
indentationLineMessage = CharToString(indentationLineChar);
305436
return true;
306437
}
307438
}
@@ -469,9 +600,11 @@ private SyntaxToken MakeInterpolatedStringTextToken(Lexer.InterpolatedStringKind
469600
return result;
470601
}
471602

472-
private static DiagnosticInfo[] MoveDiagnostics(DiagnosticInfo[] infos, int offset)
603+
private static DiagnosticInfo[]? MoveDiagnostics(DiagnosticInfo[]? infos, int offset)
473604
{
474-
Debug.Assert(infos.Length > 0);
605+
if (infos is null or [])
606+
return null;
607+
475608
var builder = ArrayBuilder<DiagnosticInfo>.GetInstance(infos.Length);
476609
foreach (var info in infos)
477610
{

0 commit comments

Comments
 (0)