Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for extended Unicode escape sequences in strings and templates #2169

Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
23 changes: 0 additions & 23 deletions src/compiler/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -623,29 +623,6 @@ module ts {
"\u0085": "\\u0085" // nextLine
};

/**
 * Produces the escaped form of `s` suitable for the inside of a double-quoted string literal.
 * Modeled closely on the abstract 'Quote'/'QuoteJSONString' operation from ECMA-262 (24.3.2.2),
 * extended with a handful of additional characters.
 * The surrounding double quotes are NOT added here; callers add them.
 */
export function escapeString(s: string): string {
    var escaped = s;

    // Backslashes and double quotes are handled first: the second pass inserts
    // backslashes of its own, and those must not be escaped again.
    if (backslashOrDoubleQuote.test(escaped)) {
        escaped = escaped.replace(backslashOrDoubleQuote, replaceCharacter);
    }

    // Then the remaining characters matched by escapedCharsRegExp.
    if (escapedCharsRegExp.test(escaped)) {
        escaped = escaped.replace(escapedCharsRegExp, replaceCharacter);
    }

    return escaped;

    // Uses the dedicated escape from escapedCharsMap when one exists; otherwise
    // falls back to a zero-padded, four-digit '\uXXXX' escape of the code unit.
    function replaceCharacter(ch: string) {
        var mapped = escapedCharsMap[ch];
        if (mapped) {
            return mapped;
        }

        var hex = ch.charCodeAt(0).toString(16);
        return "\\u" + ("0000" + hex).slice(-4);
    }
}

/**
 * Returns the name of the default library declaration file for the given options:
 * "lib.es6.d.ts" when the target is ES6, "lib.d.ts" for every other target.
 */
export function getDefaultLibFileName(options: CompilerOptions): string {
    if (options.target === ScriptTarget.ES6) {
        return "lib.es6.d.ts";
    }

    return "lib.d.ts";
}
Expand Down
2 changes: 2 additions & 0 deletions src/compiler/diagnosticInformationMap.generated.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ module ts {
Catch_clause_variable_name_must_be_an_identifier: { code: 1195, category: DiagnosticCategory.Error, key: "Catch clause variable name must be an identifier." },
Catch_clause_variable_cannot_have_a_type_annotation: { code: 1196, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have a type annotation." },
Catch_clause_variable_cannot_have_an_initializer: { code: 1197, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have an initializer." },
An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive: { code: 1198, category: DiagnosticCategory.Error, key: "An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive." },
expected: { code: 1199, category: DiagnosticCategory.Error, key: "'}' expected." },
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'Unterminated unicode escape'

Duplicate_identifier_0: { code: 2300, category: DiagnosticCategory.Error, key: "Duplicate identifier '{0}'." },
Initializer_of_instance_member_variable_0_cannot_reference_identifier_1_declared_in_the_constructor: { code: 2301, category: DiagnosticCategory.Error, key: "Initializer of instance member variable '{0}' cannot reference identifier '{1}' declared in the constructor." },
Static_members_cannot_reference_class_type_parameters: { code: 2302, category: DiagnosticCategory.Error, key: "Static members cannot reference class type parameters." },
Expand Down
9 changes: 8 additions & 1 deletion src/compiler/diagnosticMessages.json
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,14 @@
"category": "Error",
"code": 1197
},

"An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive.": {
"category": "Error",
"code": 1198
},
"'}' expected.": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have "'{0}' expected." for this.

"category": "Error",
"code": 1199
},
"Duplicate identifier '{0}'.": {
"category": "Error",
"code": 2300
Expand Down
18 changes: 12 additions & 6 deletions src/compiler/emitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2191,9 +2191,12 @@ module ts {
}

function emitLiteral(node: LiteralExpression) {
var text = languageVersion < ScriptTarget.ES6 && isTemplateLiteralKind(node.kind) ? getTemplateLiteralAsStringLiteral(node) :
node.parent ? getSourceTextOfNodeFromSourceFile(currentSourceFile, node) :
node.text;
var text = languageVersion < ScriptTarget.ES6 && (isTemplateLiteralKind(node.kind) || node.hasExtendedUnicodeEscape)
? getDoubleQuotedStringTextOfLiteral(node)
: node.parent
? getSourceTextOfNodeFromSourceFile(currentSourceFile, node)
: node.text; // TODO(drosen): Is this correct?

if (compilerOptions.sourceMap && (node.kind === SyntaxKind.StringLiteral || isTemplateLiteralKind(node.kind))) {
writer.writeLiteral(text);
}
Expand All @@ -2205,9 +2208,12 @@ module ts {
write(text);
}
}

function getTemplateLiteralAsStringLiteral(node: LiteralExpression): string {
return '"' + escapeString(node.text) + '"';

// Re-creates the literal as a double-quoted string: the literal's text is escaped,
// any remaining non-ASCII characters are turned into '\uNNNN' escapes, and the
// result is wrapped in double quotes.
function getDoubleQuotedStringTextOfLiteral(node: LiteralExpression): string {
    var asciiOnlyText = replaceNonAsciiCharacters(escapeString(node.text));
    return '"' + asciiOnlyText + '"';
}

function emitDownlevelRawTemplateLiteral(node: LiteralExpression) {
Expand Down
4 changes: 4 additions & 0 deletions src/compiler/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2163,6 +2163,10 @@ module ts {
var text = scanner.getTokenValue();
node.text = internName ? internIdentifier(text) : text;

if (scanner.hasExtendedUnicodeEscape()) {
node.hasExtendedUnicodeEscape = true;
}

if (scanner.isUnterminated()) {
node.isUnterminated = true;
}
Expand Down
87 changes: 78 additions & 9 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module ts {
getTokenPos(): number;
getTokenText(): string;
getTokenValue(): string;
hasExtendedUnicodeEscape(): boolean;
hasPrecedingLineBreak(): boolean;
isIdentifier(): boolean;
isReservedWord(): boolean;
Expand Down Expand Up @@ -556,6 +557,7 @@ module ts {
var token: SyntaxKind;
var tokenValue: string;
var precedingLineBreak: boolean;
var hasExtendedUnicodeEscape: boolean;
var tokenIsUnterminated: boolean;

function error(message: DiagnosticMessage, length?: number): void {
Expand Down Expand Up @@ -606,11 +608,21 @@ module ts {
}
return +(text.substring(start, pos));
}

// Scans hexadecimal digits with both the lower and upper bound set to `count`,
// i.e. exactly `count` digits are expected. Returns whatever scanHexDigits
// returns (a negative value signals that too few digits were found).
function scanExactNumberOfHexDigits(count: number): number {
    var scannedValue = scanHexDigits(/*minCount*/ count, /*maxCount*/ count);
    return scannedValue;
}

// Scans at least `count` hexadecimal digits with no upper bound on how many are
// consumed: leaving maxCount unspecified makes scanHexDigits keep reading digits.
function scanMinimumNumberOfHexDigits(count: number): number {
    var scannedValue = scanHexDigits(/*minCount*/ count);
    return scannedValue;
}

function scanHexDigits(count: number, mustMatchCount?: boolean): number {
function scanHexDigits(minCount: number, maxCount?: number): number {
var maxCountSpecified = maxCount !== undefined;

var digits = 0;
var value = 0;
while (digits < count || !mustMatchCount) {
while (!maxCountSpecified || digits < maxCount) {
var ch = text.charCodeAt(pos);
if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) {
value = value * 16 + ch - CharacterCodes._0;
Expand All @@ -627,7 +639,7 @@ module ts {
pos++;
digits++;
}
if (digits < count) {
if (digits < minCount) {
value = -1;
}
return value;
Expand Down Expand Up @@ -764,11 +776,18 @@ module ts {
return "\'";
case CharacterCodes.doubleQuote:
return "\"";
case CharacterCodes.x:
case CharacterCodes.u:
var ch = scanHexDigits(ch === CharacterCodes.x ? 2 : 4, /*mustMatchCount*/ true);
if (ch >= 0) {
return String.fromCharCode(ch);
if (pos < len && text.charCodeAt(pos) === CharacterCodes.openBrace) {
hasExtendedUnicodeEscape = true;
pos++;
return scanExtendedUnicodeEscape();
}

// fall through
case CharacterCodes.x:
var escapedValue = scanExactNumberOfHexDigits(ch === CharacterCodes.x ? 2 : 4);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you just extract this code, and then call it from the 'x' or 'u' cases?

if (escapedValue >= 0) {
return String.fromCharCode(escapedValue);
}
else {
error(Diagnostics.Hexadecimal_digit_expected);
Expand All @@ -790,14 +809,62 @@ module ts {
return String.fromCharCode(ch);
}
}

/**
 * Scans the body of an extended Unicode escape, i.e. the `X...}` part of `\u{X...}`.
 * The caller has already consumed everything up to and including the '{'.
 *
 * Reports an error when no hex digit is present, when the value exceeds 0x10FFFF,
 * when the text ends before the closing brace, or when the next character is not '}'.
 *
 * @returns the UTF-16 encoding of the escaped code point, or "" if any error was reported.
 */
function scanExtendedUnicodeEscape(): string {
    var escapedValue = scanMinimumNumberOfHexDigits(1);
    var isInvalidExtendedEscape = false;

    // Validate the scanned value. A negative result means no hex digit was found.
    if (escapedValue < 0) {
        error(Diagnostics.Hexadecimal_digit_expected);
        isInvalidExtendedEscape = true;
    }
    else if (escapedValue > 0x10FFFF) {
        error(Diagnostics.An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive);
        isInvalidExtendedEscape = true;
    }

    // Validate the terminator.
    if (pos >= len) {
        error(Diagnostics.Unexpected_end_of_text);
        isInvalidExtendedEscape = true;
    }
    else if (text.charCodeAt(pos) === CharacterCodes.closeBrace) {
        // Only swallow the following character up if it's a '}'.
        pos++;
    }
    else {
        error(Diagnostics.expected); // '}' expected.
        isInvalidExtendedEscape = true;
    }

    if (isInvalidExtendedEscape) {
        return "";
    }

    return utf16EncodeAsString(escapedValue);
}

// Derived from the 10.1.1 UTF16Encoding of the ES6 Spec.
function utf16EncodeAsString(codePoint: number): string {
Debug.assert(0x0 <= codePoint && codePoint <= 0x10FFFF);

if (codePoint <= 65535) {
return String.fromCharCode(codePoint);
}

var codeUnit1 = Math.floor((codePoint - 65536) / 1024) + 0xD800;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you just reference the relevant portion of the ES6 spec?

Can you just do "|0" to floor instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's exactly the same as 10.1.1

var codeUnit2 = ((codePoint - 65536) % 1024) + 0xDC00;

return String.fromCharCode(codeUnit1, codeUnit2);
}

// Current character is known to be a backslash. Check for Unicode escape of the form '\uXXXX'
// and return code point value if valid Unicode escape is found. Otherwise return -1.
function peekUnicodeEscape(): number {
if (pos + 5 < len && text.charCodeAt(pos + 1) === CharacterCodes.u) {
var start = pos;
pos += 2;
var value = scanHexDigits(4, /*mustMatchCount*/ true);
var value = scanExactNumberOfHexDigits(4);
pos = start;
return value;
}
Expand Down Expand Up @@ -869,6 +936,7 @@ module ts {

function scan(): SyntaxKind {
startPos = pos;
hasExtendedUnicodeEscape = false;
precedingLineBreak = false;
tokenIsUnterminated = false;
while (true) {
Expand Down Expand Up @@ -1034,7 +1102,7 @@ module ts {
case CharacterCodes._0:
if (pos + 2 < len && (text.charCodeAt(pos + 1) === CharacterCodes.X || text.charCodeAt(pos + 1) === CharacterCodes.x)) {
pos += 2;
var value = scanHexDigits(1, /*mustMatchCount*/ false);
var value = scanMinimumNumberOfHexDigits(1);
if (value < 0) {
error(Diagnostics.Hexadecimal_digit_expected);
value = 0;
Expand Down Expand Up @@ -1336,6 +1404,7 @@ module ts {
getTokenPos: () => tokenPos,
getTokenText: () => text.substring(tokenPos, pos),
getTokenValue: () => tokenValue,
hasExtendedUnicodeEscape: () => hasExtendedUnicodeEscape,
hasPrecedingLineBreak: () => precedingLineBreak,
isIdentifier: () => token === SyntaxKind.Identifier || token > SyntaxKind.LastReservedWord,
isReservedWord: () => token >= SyntaxKind.FirstReservedWord && token <= SyntaxKind.LastReservedWord,
Expand Down
1 change: 1 addition & 0 deletions src/compiler/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,7 @@ module ts {
// A literal expression node carrying the literal's text plus flags about how it was scanned.
export interface LiteralExpression extends PrimaryExpression {
// The text of the literal; the emitter escapes and quotes this when it re-creates the literal.
text: string;
// Optional flag marking a literal whose token was unterminated.
isUnterminated?: boolean;
// Optional flag marking a literal that contained a '\u{...}' escape. When targeting
// below ES6 the emitter uses it to re-emit the literal as an escaped double-quoted
// string instead of copying the source text.
hasExtendedUnicodeEscape?: boolean;
}

export interface StringLiteralExpression extends LiteralExpression {
Expand Down
51 changes: 50 additions & 1 deletion src/compiler/utilities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1130,7 +1130,7 @@ module ts {
newEndN = Math.max(newEnd2, newEnd2 + (newEnd1 - oldEnd2));
}

return createTextChangeRange(createTextSpanFromBounds(oldStartN, oldEndN), /*newLength: */newEndN - oldStartN);
return createTextChangeRange(createTextSpanFromBounds(oldStartN, oldEndN), /*newLength: */ newEndN - oldStartN);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove space after :

}

// @internal
Expand Down Expand Up @@ -1212,4 +1212,53 @@ module ts {
}
}
}

// Matches every backslash and double quote in a string. escapeString replaces these
// in its first pass, before any other character is escaped.
var backslashOrDoubleQuote = /[\"\\]/g;
// Matches the remaining characters escapeString rewrites in its second pass: the
// control characters U+0000 through U+001F (the \t \v \f \b \r \n entries already fall
// inside that range; within a character class \b is backspace), plus the line
// separator U+2028, the paragraph separator U+2029 and next-line U+0085.
var escapedCharsRegExp = /[\u0000-\u001f\t\v\f\b\r\n\u2028\u2029\u0085]/g;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Document this.

// Maps a single character to the escape sequence text emitted for it by escapeString.
// Characters matched by the escaping regular expressions that have no entry here are
// emitted as a four-digit '\uNNNN' escape instead.
var escapedCharsMap: Map<string> = {
"\0": "\\0",
"\t": "\\t",
"\v": "\\v",
"\f": "\\f",
"\b": "\\b",
"\r": "\\r",
"\n": "\\n",
"\\": "\\\\",
"\"": "\\\"",
"\u2028": "\\u2028", // lineSeparator
"\u2029": "\\u2029", // paragraphSeparator
"\u0085": "\\u0085" // nextLine
};

/**
 * Based heavily on the abstract 'Quote'/ 'QuoteJSONString' operation from ECMA-262 (24.3.2.2),
 * but augmented for a few select characters.
 * Note that this doesn't actually wrap the input in double quotes.
 *
 * @param s the raw string value to escape.
 * @returns `s` with backslashes, double quotes, control characters and the line
 * terminators U+2028/U+2029/U+0085 replaced by escape sequences.
 */
export function escapeString(s: string): string {
    // Prioritize '"' and '\': the second pass introduces backslashes of its own,
    // which must not be escaped a second time.
    s = backslashOrDoubleQuote.test(s) ? s.replace(backslashOrDoubleQuote, getReplacement) : s;
    s = escapedCharsRegExp.test(s) ? s.replace(escapedCharsRegExp, getReplacement) : s;

    return s;

    // `offset` and `input` are the match position and the string being searched,
    // as supplied by String.prototype.replace to its callback.
    function getReplacement(c: string, offset: number, input: string) {
        if (c === "\0") {
            // '\0' directly followed by a decimal digit does not read back as NUL plus
            // that digit: it is parsed as an octal-style escape (and is rejected in
            // strict mode code). Emit a hex escape for NUL in that situation.
            // charCodeAt yields NaN past the end of the string, so both comparisons fail.
            var next = input.charCodeAt(offset + c.length);
            if (next >= 0x30 /* '0' */ && next <= 0x39 /* '9' */) {
                return "\\x00";
            }
        }

        return escapedCharsMap[c] || get16BitUnicodeEscapeSequence(c.charCodeAt(0));
    }
}

// Formats a UTF-16 code unit as a '\uXXXX' escape: upper-case hexadecimal,
// left-padded with zeros, keeping the last four characters.
function get16BitUnicodeEscapeSequence(charCode: number): string {
    var zeroPadded = "0000" + charCode.toString(16).toUpperCase();
    // zeroPadded always has at least four characters, so this keeps its last four.
    return "\\u" + zeroPadded.substring(zeroPadded.length - 4);
}

// Matches every UTF-16 code unit outside the ASCII range U+0000..U+007F.
var nonAsciiCharacters = /[^\u0000-\u007F]/g;

/**
 * Rewrites each non-ASCII code unit of `s` as a '\uNNNN' escape sequence.
 * A string without any such character is returned as-is.
 */
export function replaceNonAsciiCharacters(s: string): string {
    if (!nonAsciiCharacters.test(s)) {
        return s;
    }

    return s.replace(nonAsciiCharacters, function (ch) {
        return get16BitUnicodeEscapeSequence(ch.charCodeAt(0));
    });
}
}
2 changes: 2 additions & 0 deletions tests/baselines/reference/APISample_compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ declare module "typescript" {
interface LiteralExpression extends PrimaryExpression {
text: string;
isUnterminated?: boolean;
hasExtendedUnicodeEscape?: boolean;
}
interface StringLiteralExpression extends LiteralExpression {
_stringLiteralExpressionBrand: any;
Expand Down Expand Up @@ -1420,6 +1421,7 @@ declare module "typescript" {
getTokenPos(): number;
getTokenText(): string;
getTokenValue(): string;
hasExtendedUnicodeEscape(): boolean;
hasPrecedingLineBreak(): boolean;
isIdentifier(): boolean;
isReservedWord(): boolean;
Expand Down
6 changes: 6 additions & 0 deletions tests/baselines/reference/APISample_compile.types
Original file line number Diff line number Diff line change
Expand Up @@ -1664,6 +1664,9 @@ declare module "typescript" {

isUnterminated?: boolean;
>isUnterminated : boolean

hasExtendedUnicodeEscape?: boolean;
>hasExtendedUnicodeEscape : boolean
}
interface StringLiteralExpression extends LiteralExpression {
>StringLiteralExpression : StringLiteralExpression
Expand Down Expand Up @@ -4477,6 +4480,9 @@ declare module "typescript" {
getTokenValue(): string;
>getTokenValue : () => string

hasExtendedUnicodeEscape(): boolean;
>hasExtendedUnicodeEscape : () => boolean

hasPrecedingLineBreak(): boolean;
>hasPrecedingLineBreak : () => boolean

Expand Down
2 changes: 2 additions & 0 deletions tests/baselines/reference/APISample_linter.js
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ declare module "typescript" {
interface LiteralExpression extends PrimaryExpression {
text: string;
isUnterminated?: boolean;
hasExtendedUnicodeEscape?: boolean;
}
interface StringLiteralExpression extends LiteralExpression {
_stringLiteralExpressionBrand: any;
Expand Down Expand Up @@ -1451,6 +1452,7 @@ declare module "typescript" {
getTokenPos(): number;
getTokenText(): string;
getTokenValue(): string;
hasExtendedUnicodeEscape(): boolean;
hasPrecedingLineBreak(): boolean;
isIdentifier(): boolean;
isReservedWord(): boolean;
Expand Down
Loading