Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

When producing character literals, surrogate characters should be escaped. #20720

Merged
merged 8 commits into from
Jul 31, 2017
44 changes: 39 additions & 5 deletions src/Compilers/CSharp/Portable/SymbolDisplay/ObjectDisplay.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System;
using System.Diagnostics;
using System.Globalization;
using System.Reflection;
using System.Text;
Expand Down Expand Up @@ -175,13 +176,24 @@ private static bool TryReplaceChar(char c, out string replaceWith)
return true;
}

switch (CharUnicodeInfo.GetUnicodeCategory(c))
if (NeedsEscaping(CharUnicodeInfo.GetUnicodeCategory(c)))
{
replaceWith = "\\u" + ((int)c).ToString("x4");
return true;
}

return false;
}

private static bool NeedsEscaping(UnicodeCategory category)
{
switch (category)
{
case UnicodeCategory.Control:
case UnicodeCategory.OtherNotAssigned:
case UnicodeCategory.ParagraphSeparator:
case UnicodeCategory.LineSeparator:
replaceWith = "\\u" + ((int)c).ToString("x4");
case UnicodeCategory.Surrogate:
return true;
default:
return false;
Expand Down Expand Up @@ -223,10 +235,32 @@ public static string FormatLiteral(string value, ObjectDisplayOptions options)
builder.Append(quote);
}

foreach (var c in value)
for (int i = 0; i < value.Length; i++)
{
string replaceWith;
if (escapeNonPrintable && TryReplaceChar(c, out replaceWith))
char c = value[i];
if (escapeNonPrintable && CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.Surrogate)
{
var category = CharUnicodeInfo.GetUnicodeCategory(value, i);
if (category == UnicodeCategory.Surrogate)
{
// an unpaired surrogate
builder.Append("\\u" + ((int)c).ToString("x4"));
}
else if (NeedsEscaping(category))
{
// a surrogate pair that needs to be escaped
var unicode = char.ConvertToUtf32(value, i);
builder.Append("\\U" + unicode.ToString("x8"));
i++; // skip the already-encoded second surrogate of the pair
}
else
{
// copy a printable surrogate pair directly
builder.Append(c);
builder.Append(value[++i]);
}
}
else if (escapeNonPrintable && TryReplaceChar(c, out var replaceWith))
{
builder.Append(replaceWith);
}
Expand Down
8 changes: 8 additions & 0 deletions src/Compilers/CSharp/Test/Syntax/Syntax/SyntaxFactoryTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,14 @@ public void TestEscapeLineSeparator()
Assert.Equal("\"\\u2028\"", literal.Text);
}

[WorkItem(20693, "https://github.com/dotnet/roslyn/issues/20693")]
[Fact]
public void TestEscapeSurrogate()
{
var literal = SyntaxFactory.Literal('\uDBFF');
Assert.Equal("'\\udbff'", literal.Text);
}

private static void CheckLiteralToString(dynamic value, string expected)
{
var literal = SyntaxFactory.Literal(value);
Expand Down
29 changes: 27 additions & 2 deletions src/Compilers/VisualBasic/Portable/SymbolDisplay/ObjectDisplay.vb
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ Namespace Microsoft.CodeAnalysis.VisualBasic.ObjectDisplay
Dim wellKnown As String
Dim shouldEscape As Boolean
Dim isCrLf As Boolean
Dim copyPair = False

If Not escapeNonPrintable Then
wellKnown = Nothing
Expand All @@ -392,6 +393,12 @@ Namespace Microsoft.CodeAnalysis.VisualBasic.ObjectDisplay
shouldEscape = True
isCrLf = True
i += 1
ElseIf CharUnicodeInfo.GetUnicodeCategory(c) = UnicodeCategory.Surrogate AndAlso IsPrintable(CharUnicodeInfo.GetUnicodeCategory(str, i - 1)) Then
' copy properly paired surrogates directly into the resulting output
wellKnown = Nothing
shouldEscape = False
isCrLf = False
copyPair = True
Else
wellKnown = GetWellKnownCharacterName(c)
shouldEscape = wellKnown IsNot Nothing OrElse Not IsPrintable(c)
Expand Down Expand Up @@ -462,6 +469,12 @@ Namespace Microsoft.CodeAnalysis.VisualBasic.ObjectDisplay
Yield Quotes()
Else
Yield Character(c)
If copyPair Then
' copy the second character of a Unicode surrogate pair
c = str(i)
i += 1
Yield Character(c)
End If
End If
End If
End While
Expand All @@ -472,8 +485,20 @@ Namespace Microsoft.CodeAnalysis.VisualBasic.ObjectDisplay
End Function

Friend Function IsPrintable(c As Char) As Boolean
Dim category = CharUnicodeInfo.GetUnicodeCategory(c)
Return category <> UnicodeCategory.OtherNotAssigned AndAlso category <> UnicodeCategory.ParagraphSeparator AndAlso category <> UnicodeCategory.Control
Return IsPrintable(CharUnicodeInfo.GetUnicodeCategory(c))
End Function

Private Function IsPrintable(category As UnicodeCategory) As Boolean
Select Case category
Case UnicodeCategory.OtherNotAssigned,
UnicodeCategory.ParagraphSeparator,
UnicodeCategory.Control,
UnicodeCategory.LineSeparator,
UnicodeCategory.Surrogate
Return False
Case Else
Return True
End Select
End Function

Friend Function GetWellKnownCharacterName(c As Char) As String
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,30 @@ Namespace Microsoft.CodeAnalysis.VisualBasic.UnitTests
Assert.Equal("""a"" & vbVerticalTab", FormatPrimitiveUsingHexadecimalNumbers("a" & vbVerticalTab, quoteStrings:=True))
End Sub

<Fact>
Public Sub TextForEscapedStringLiterals_01()
Dim literal = SyntaxFactory.Literal(ChrW(&H2028) & "x") ' U+2028 is a line separator
Assert.Equal("ChrW(8232) & ""x""", literal.Text)
literal = SyntaxFactory.Literal(ChrW(&HDBFF)) ' U+DBFF is a unicode surrogate
Assert.Equal("ChrW(56319)", literal.Text)
End Sub
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider testing ObjectDisplay.FormatLiteral directly. Perhaps:

Assert.Equal("...",
    ObjectDisplay.FormatLiteral(
        ChrW(&HD83C) & ChrW(&HDFC8),
        ObjectDisplayOptions.UseQuotes Or ObjectDisplayOptions.EscapeNonPrintableCharacters))

And a similar test for C# ObjectDisplayTests:

Assert.Equal("...",
    ObjectDisplay.FormatLiteral("\ud83c", ObjectDisplayOptions.EscapeNonPrintableCharacters));
Assert.Equal("...",
    ObjectDisplay.FormatLiteral("\ud83c\udfc8", ObjectDisplayOptions.EscapeNonPrintableCharacters));

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ObjectDisplay is not a public API. But it is used in SymbolDisplay.FormatPrimitive, which I can test.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The methods in this class are directly testing ObjectDisplay.


<Fact>
Public Sub TextForEscapedStringLiterals_02()
' Well-ordered surrogate characters
Dim footBall = "🏈"
Assert.Equal(footBall, ObjectDisplay.FormatPrimitive(footBall, ObjectDisplayOptions.None))
Assert.Equal("""" & footBall & """", ObjectDisplay.FormatPrimitive(footBall, ObjectDisplayOptions.UseQuotes Or ObjectDisplayOptions.EscapeNonPrintableCharacters Or ObjectDisplayOptions.UseHexadecimalNumbers))
Assert.Equal("""" & footBall & """", ObjectDisplay.FormatPrimitive(footBall, ObjectDisplayOptions.UseQuotes Or ObjectDisplayOptions.EscapeNonPrintableCharacters))

' Misordered surrogate characters
Dim trash = ChrW(&HDFC8) & ChrW(&HD83C)
Assert.Equal(trash, ObjectDisplay.FormatPrimitive(trash, ObjectDisplayOptions.None))
Assert.Equal("ChrW(&HDFC8) & ChrW(&HD83C)", ObjectDisplay.FormatPrimitive(trash, ObjectDisplayOptions.UseQuotes Or ObjectDisplayOptions.EscapeNonPrintableCharacters Or ObjectDisplayOptions.UseHexadecimalNumbers))
Assert.Equal("ChrW(57288) & ChrW(55356)", ObjectDisplay.FormatPrimitive(trash, ObjectDisplayOptions.UseQuotes Or ObjectDisplayOptions.EscapeNonPrintableCharacters))

End Sub

<Fact>
Public Sub Strings_QuotesAndEscaping()
Assert.Equal(QuoteAndEscapingCombinations("a"), {"a", """a""", """a"""})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,18 @@ public void String()
Assert.Equal("\"\\uffff\"", FormatValue(s));
Assert.Equal("\"\\uffff\"", FormatValue(s, useHexadecimal: true));

string multiByte = "\ud83c\udfc8";
string multiByte = "\ud83c\udfc8"; // unicode surrogates properly paired representing a printable Unicode codepoint
Assert.Equal(string.Format(format, "🏈"), FormatValue(multiByte));
Assert.Equal(string.Format(format, "🏈"), FormatValue(multiByte, useHexadecimal: true));
Assert.Equal(multiByte, "🏈");

multiByte = "\udbff\udfff"; // unicode surrogates representing an unprintable Unicode codepoint
Assert.Equal(string.Format(format, "\\U0010ffff"), FormatValue(multiByte));
Assert.Equal(string.Format(format, "\\U0010ffff"), FormatValue(multiByte, useHexadecimal: true));

multiByte = "\udfc8\ud83c"; // unicode surrogates not properly paired (in the wrong order in this case)
Assert.Equal(string.Format(format, "\\udfc8\\ud83c"), FormatValue(multiByte));
Assert.Equal(string.Format(format, "\\udfc8\\ud83c"), FormatValue(multiByte, useHexadecimal: true));
}

private static string FormatStringChar(char c)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,11 @@ Namespace Microsoft.CodeAnalysis.VisualBasic.ExpressionEvaluator.UnitTests
Dim multiByte = ChrW(&HD83C) & ChrW(&HDFC8)
Assert.Equal("""🏈""", FormatValue(multiByte))
Assert.Equal("""🏈""", FormatValue(multiByte, useHexadecimal:=True))
Assert.Equal("🏈", multiByte)

multiByte = ChrW(&HDFC8) & ChrW(&HD83C)
Assert.Equal("ChrW(57288) & ChrW(55356)", FormatValue(multiByte))
Assert.Equal("ChrW(&HDFC8) & ChrW(&HD83C)", FormatValue(multiByte, useHexadecimal:=True))
End Sub

<Fact>
Expand Down