-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for extended Unicode escape sequences in strings and templates #2169
Changes from 11 commits
f16e875
ee71954
1e602de
6ad1780
bbf9579
a81bf8c
9d89668
4657c2d
cad8f6b
3c34478
5437b3d
b1837c8
6be13a9
5ec68eb
f9cc013
5c5a489
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -611,7 +611,14 @@ | |
"category": "Error", | ||
"code": 1197 | ||
}, | ||
|
||
"An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive.": { | ||
"category": "Error", | ||
"code": 1198 | ||
}, | ||
"'}' expected.": { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We already have "'{0}' expected." for this. |
||
"category": "Error", | ||
"code": 1199 | ||
}, | ||
"Duplicate identifier '{0}'.": { | ||
"category": "Error", | ||
"code": 2300 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ module ts { | |
getTokenPos(): number; | ||
getTokenText(): string; | ||
getTokenValue(): string; | ||
hasExtendedUnicodeEscape(): boolean; | ||
hasPrecedingLineBreak(): boolean; | ||
isIdentifier(): boolean; | ||
isReservedWord(): boolean; | ||
|
@@ -556,6 +557,7 @@ module ts { | |
var token: SyntaxKind; | ||
var tokenValue: string; | ||
var precedingLineBreak: boolean; | ||
var hasExtendedUnicodeEscape: boolean; | ||
var tokenIsUnterminated: boolean; | ||
|
||
function error(message: DiagnosticMessage, length?: number): void { | ||
|
@@ -606,11 +608,21 @@ module ts { | |
} | ||
return +(text.substring(start, pos)); | ||
} | ||
|
||
/**
 * Scans a run of hexadecimal digits that must be exactly `count` digits long.
 *
 * Delegates to scanHexDigits using `count` as both the lower and the upper bound,
 * and hands back whatever scanHexDigits produces (it yields -1 when fewer than
 * the minimum number of digits were consumed).
 */
function scanExactNumberOfHexDigits(count: number): number {
    var scannedValue = scanHexDigits(/*minCount*/ count, /*maxCount*/ count);
    return scannedValue;
}
|
||
/**
 * Scans a run of hexadecimal digits that must be at least `count` digits long.
 *
 * Delegates to scanHexDigits with no upper bound (maxCount is left undefined),
 * and hands back whatever scanHexDigits produces (it yields -1 when fewer than
 * the minimum number of digits were consumed).
 */
function scanMinimumNumberOfHexDigits(count: number): number {
    var scannedValue = scanHexDigits(/*minCount*/ count, /*maxCount*/ undefined);
    return scannedValue;
}
|
||
function scanHexDigits(count: number, mustMatchCount?: boolean): number { | ||
function scanHexDigits(minCount: number, maxCount?: number): number { | ||
var maxCountSpecified = maxCount !== undefined; | ||
|
||
var digits = 0; | ||
var value = 0; | ||
while (digits < count || !mustMatchCount) { | ||
while (!maxCountSpecified || digits < maxCount) { | ||
var ch = text.charCodeAt(pos); | ||
if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) { | ||
value = value * 16 + ch - CharacterCodes._0; | ||
|
@@ -627,7 +639,7 @@ module ts { | |
pos++; | ||
digits++; | ||
} | ||
if (digits < count) { | ||
if (digits < minCount) { | ||
value = -1; | ||
} | ||
return value; | ||
|
@@ -764,11 +776,18 @@ module ts { | |
return "\'"; | ||
case CharacterCodes.doubleQuote: | ||
return "\""; | ||
case CharacterCodes.x: | ||
case CharacterCodes.u: | ||
var ch = scanHexDigits(ch === CharacterCodes.x ? 2 : 4, /*mustMatchCount*/ true); | ||
if (ch >= 0) { | ||
return String.fromCharCode(ch); | ||
if (pos < len && text.charCodeAt(pos) === CharacterCodes.openBrace) { | ||
hasExtendedUnicodeEscape = true; | ||
pos++; | ||
return scanExtendedUnicodeEscape(); | ||
} | ||
|
||
// fall through | ||
case CharacterCodes.x: | ||
var escapedValue = scanExactNumberOfHexDigits(ch === CharacterCodes.x ? 2 : 4); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you just extract this code, and then call it from the 'x' and 'u' cases? |
||
if (escapedValue >= 0) { | ||
return String.fromCharCode(escapedValue); | ||
} | ||
else { | ||
error(Diagnostics.Hexadecimal_digit_expected); | ||
|
@@ -790,14 +809,62 @@ module ts { | |
return String.fromCharCode(ch); | ||
} | ||
} | ||
|
||
/**
 * Scans the body of an extended Unicode escape, i.e. the `XXXX}` part of `\u{XXXX}`.
 * The caller has already advanced `pos` past the opening '{'.
 *
 * Reports a diagnostic when no hex digit is present, when the value exceeds
 * 0x10FFFF, when the text ends before the closing brace, or when the character
 * after the digits is not '}'.
 *
 * @returns the UTF-16 encoded string for the escaped code point, or "" when any
 *          of the above errors was reported.
 */
function scanExtendedUnicodeEscape(): string {
    var escapedValue = scanMinimumNumberOfHexDigits(1);
    var isInvalidExtendedEscape = false;

    // Validate the scanned value: negative means no hex digits were found.
    if (escapedValue < 0) {
        error(Diagnostics.Hexadecimal_digit_expected);
        isInvalidExtendedEscape = true;
    }
    else if (escapedValue > 0x10FFFF) {
        error(Diagnostics.An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive);
        isInvalidExtendedEscape = true;
    }

    if (pos >= len) {
        error(Diagnostics.Unexpected_end_of_text);
        isInvalidExtendedEscape = true;
    }
    else if (text.charCodeAt(pos) === CharacterCodes.closeBrace) {
        // Only swallow the following character up if it's a '}'.
        pos++;
    }
    else {
        error(Diagnostics.expected); // '}' expected.
        isInvalidExtendedEscape = true;
    }

    if (isInvalidExtendedEscape) {
        return "";
    }

    return utf16EncodeAsString(escapedValue);
}
|
||
// Derived from the 10.1.1 UTF16Encoding of the ES6 Spec. | ||
function utf16EncodeAsString(codePoint: number): string { | ||
Debug.assert(0x0 <= codePoint && codePoint <= 0x10FFFF); | ||
|
||
if (codePoint <= 65535) { | ||
return String.fromCharCode(codePoint); | ||
} | ||
|
||
var codeUnit1 = Math.floor((codePoint - 65536) / 1024) + 0xD800; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you just reference the relevant portion of the ES6 spec? Can you just do "|0" to floor instead? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's exactly the same as 10.1.1 |
||
var codeUnit2 = ((codePoint - 65536) % 1024) + 0xDC00; | ||
|
||
return String.fromCharCode(codeUnit1, codeUnit2); | ||
} | ||
|
||
// Current character is known to be a backslash. Check for Unicode escape of the form '\uXXXX' | ||
// and return code point value if valid Unicode escape is found. Otherwise return -1. | ||
function peekUnicodeEscape(): number { | ||
if (pos + 5 < len && text.charCodeAt(pos + 1) === CharacterCodes.u) { | ||
var start = pos; | ||
pos += 2; | ||
var value = scanHexDigits(4, /*mustMatchCount*/ true); | ||
var value = scanExactNumberOfHexDigits(4); | ||
pos = start; | ||
return value; | ||
} | ||
|
@@ -869,6 +936,7 @@ module ts { | |
|
||
function scan(): SyntaxKind { | ||
startPos = pos; | ||
hasExtendedUnicodeEscape = false; | ||
precedingLineBreak = false; | ||
tokenIsUnterminated = false; | ||
while (true) { | ||
|
@@ -1034,7 +1102,7 @@ module ts { | |
case CharacterCodes._0: | ||
if (pos + 2 < len && (text.charCodeAt(pos + 1) === CharacterCodes.X || text.charCodeAt(pos + 1) === CharacterCodes.x)) { | ||
pos += 2; | ||
var value = scanHexDigits(1, /*mustMatchCount*/ false); | ||
var value = scanMinimumNumberOfHexDigits(1); | ||
if (value < 0) { | ||
error(Diagnostics.Hexadecimal_digit_expected); | ||
value = 0; | ||
|
@@ -1336,6 +1404,7 @@ module ts { | |
getTokenPos: () => tokenPos, | ||
getTokenText: () => text.substring(tokenPos, pos), | ||
getTokenValue: () => tokenValue, | ||
hasExtendedUnicodeEscape: () => hasExtendedUnicodeEscape, | ||
hasPrecedingLineBreak: () => precedingLineBreak, | ||
isIdentifier: () => token === SyntaxKind.Identifier || token > SyntaxKind.LastReservedWord, | ||
isReservedWord: () => token >= SyntaxKind.FirstReservedWord && token <= SyntaxKind.LastReservedWord, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1130,7 +1130,7 @@ module ts { | |
newEndN = Math.max(newEnd2, newEnd2 + (newEnd1 - oldEnd2)); | ||
} | ||
|
||
return createTextChangeRange(createTextSpanFromBounds(oldStartN, oldEndN), /*newLength: */newEndN - oldStartN); | ||
return createTextChangeRange(createTextSpanFromBounds(oldStartN, oldEndN), /*newLength: */ newEndN - oldStartN); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove space after : |
||
} | ||
|
||
// @internal | ||
|
@@ -1212,4 +1212,53 @@ module ts { | |
} | ||
} | ||
} | ||
|
||
var backslashOrDoubleQuote = /[\"\\]/g; | ||
var escapedCharsRegExp = /[\u0000-\u001f\t\v\f\b\r\n\u2028\u2029\u0085]/g; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Document this. |
||
var escapedCharsMap: Map<string> = { | ||
"\0": "\\0", | ||
"\t": "\\t", | ||
"\v": "\\v", | ||
"\f": "\\f", | ||
"\b": "\\b", | ||
"\r": "\\r", | ||
"\n": "\\n", | ||
"\\": "\\\\", | ||
"\"": "\\\"", | ||
"\u2028": "\\u2028", // lineSeparator | ||
"\u2029": "\\u2029", // paragraphSeparator | ||
"\u0085": "\\u0085" // nextLine | ||
}; | ||
|
||
/**
 * Based heavily on the abstract 'Quote'/ 'QuoteJSONString' operation from ECMA-262 (24.3.2.2),
 * but augmented for a few select characters.
 * Note that this doesn't actually wrap the input in double quotes.
 *
 * Backslashes and double quotes are escaped in a first pass; the remaining
 * escapable characters are handled in a second pass so that the backslashes
 * introduced by the first pass are not escaped again.
 *
 * @param s the raw string contents to escape.
 * @returns the escaped text, or the original string when nothing needed escaping.
 */
export function escapeString(s: string): string {
    // Prioritize '"' and '\'
    s = backslashOrDoubleQuote.test(s) ? s.replace(backslashOrDoubleQuote, getReplacement) : s;
    s = escapedCharsRegExp.test(s) ? s.replace(escapedCharsRegExp, getReplacement) : s;

    return s;

    function getReplacement(c: string, offset: number, input: string) {
        if (c === "\0") {
            // "\0" immediately followed by a decimal digit would read back as a
            // legacy octal escape (e.g. "\01"), so emit the unambiguous four-digit form.
            // charCodeAt yields NaN past the end of the input, which fails both comparisons.
            var following = input.charCodeAt(offset + 1);
            if (following >= 0x30 /* '0' */ && following <= 0x39 /* '9' */) {
                return get16BitUnicodeEscapeSequence(0);
            }
        }

        return escapedCharsMap[c] || get16BitUnicodeEscapeSequence(c.charCodeAt(0));
    }
}
|
||
/**
 * Formats a character code as a `\uXXXX` escape: the code is written in
 * upper-case hexadecimal, left-padded with zeros, and only the last four
 * characters of the padded text are kept.
 */
function get16BitUnicodeEscapeSequence(charCode: number): string {
    var zeroPadded = "0000" + charCode.toString(16).toUpperCase();
    var lastFourDigits = zeroPadded.substring(zeroPadded.length - 4);
    return "\\u" + lastFourDigits;
}
|
||
var nonAsciiCharacters = /[^\u0000-\u007F]/g;

/**
 * Rewrites every character outside the range U+0000..U+007F as a '\uNNNN' escape.
 * When the input contains no such character it is returned unchanged.
 */
export function replaceNonAsciiCharacters(s: string): string {
    if (!nonAsciiCharacters.test(s)) {
        // Nothing to escape; hand back the original string.
        return s;
    }

    return s.replace(nonAsciiCharacters, escapeCharacter);

    function escapeCharacter(c: string): string {
        return get16BitUnicodeEscapeSequence(c.charCodeAt(0));
    }
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'Unterminated unicode escape'