Skip to content

Commit

Permalink
Recognize identifier characters from the Supplementary Multilingual P…
Browse files Browse the repository at this point in the history
…lane.

Fixes #1244.
  • Loading branch information
ariya committed Jul 21, 2015
1 parent 04f0a68 commit 014fa4c
Show file tree
Hide file tree
Showing 22 changed files with 675 additions and 33 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ with the help of [many contributors](https://github.com/jquery/esprima/contribut
- Full support for ECMAScript 5.1 ([ECMA-262](http://www.ecma-international.org/publications/standards/Ecma-262.htm))
- Sensible [syntax tree format](https://github.com/estree/estree/blob/master/spec.md) as standardized by [EStree project](https://github.com/estree/estree)
- Optional tracking of syntax node location (index-based and line-column)
- Heavily tested (~1100 [unit tests](https://github.com/jquery/esprima/tree/master/test/fixtures) with [full code coverage](https://travis-ci.org/jquery/esprima))
- Heavily tested (~1200 [unit tests](https://github.com/jquery/esprima/tree/master/test/fixtures) with [full code coverage](https://travis-ci.org/jquery/esprima))
- [Partial support](https://github.com/jquery/esprima/issues/1099) for ECMAScript 6

Esprima serves as a **building block** for some JavaScript
Expand Down
93 changes: 61 additions & 32 deletions esprima.js
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,18 @@

// 11.6 Identifier Names and Identifiers

/**
 * Convert a Unicode code point into its UTF-16 string form.
 *
 * Code points below 0x10000 map to a single code unit. Anything at or
 * above 0x10000 is split into a high (lead) and a low (trail) surrogate.
 *
 * @param {number} cp - Code point to encode.
 * @returns {string} One code unit, or a two-unit surrogate pair.
 */
function fromCodePoint(cp) {
    var offset, lead, trail;

    if (cp < 0x10000) {
        return String.fromCharCode(cp);
    }

    // Offset into the supplementary range: the upper bits select the
    // lead surrogate, the lower 10 bits select the trail surrogate.
    offset = cp - 0x10000;
    lead = String.fromCharCode(0xD800 + (offset >> 10));
    trail = String.fromCharCode(0xDC00 + (offset & 1023));

    return lead + trail;
}

function isIdentifierStart(ch) {
return (ch === 0x24) || (ch === 0x5F) || // $ (dollar) and _ (underscore)
(ch >= 0x41 && ch <= 0x5A) || // A..Z
(ch >= 0x61 && ch <= 0x7A) || // a..z
(ch === 0x5C) || // \ (backslash)
((ch >= 0x80) && Regex.NonAsciiIdentifierStart.test(String.fromCharCode(ch)));
((ch >= 0x80) && Regex.NonAsciiIdentifierStart.test(fromCodePoint(ch)));
}

function isIdentifierPart(ch) {
Expand All @@ -311,7 +317,7 @@
(ch >= 0x61 && ch <= 0x7A) || // a..z
(ch >= 0x30 && ch <= 0x39) || // 0..9
(ch === 0x5C) || // \ (backslash)
((ch >= 0x80) && Regex.NonAsciiIdentifierPart.test(String.fromCharCode(ch)));
((ch >= 0x80) && Regex.NonAsciiIdentifierPart.test(fromCodePoint(ch)));
}

// 11.6.2.2 Future Reserved Words
Expand Down Expand Up @@ -583,7 +589,7 @@
}

function scanUnicodeCodePointEscape() {
var ch, code, cu1, cu2;
var ch, code;

ch = source[index];
code = 0;
Expand All @@ -605,23 +611,31 @@
throwUnexpectedToken();
}

// UTF-16 Encoding
if (code <= 0xFFFF) {
return String.fromCharCode(code);
return fromCodePoint(code);
}

function codePointAt(i) {
var cp, first, second;

cp = source.charCodeAt(i);
if (cp >= 0xD800 && cp <= 0xDBFF) {
first = cp;
second = source.charCodeAt(i + 1);
cp = (first - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
}
cu1 = ((code - 0x10000) >> 10) + 0xD800;
cu2 = ((code - 0x10000) & 1023) + 0xDC00;
return String.fromCharCode(cu1, cu2);

return cp;
}

function getEscapedIdentifier() {
var ch, id;
function getComplexIdentifier() {
var cp, ch, id;

ch = source.charCodeAt(index++);
id = String.fromCharCode(ch);
cp = codePointAt(index);
id = fromCodePoint(cp);
index += id.length;

// '\u' (U+005C, U+0075) denotes an escaped character.
if (ch === 0x5C) {
if (cp === 0x5C) {
if (source.charCodeAt(index) !== 0x75) {
throwUnexpectedToken();
}
Expand All @@ -631,23 +645,25 @@
ch = scanUnicodeCodePointEscape();
} else {
ch = scanHexEscape('u');
if (!ch || ch === '\\' || !isIdentifierStart(ch.charCodeAt(0))) {
cp = ch.charCodeAt(0);
if (!ch || ch === '\\' || !isIdentifierStart(cp)) {
throwUnexpectedToken();
}
}
id = ch;
}

while (index < length) {
ch = source.charCodeAt(index);
if (!isIdentifierPart(ch)) {
cp = codePointAt(index);
if (!isIdentifierPart(cp)) {
break;
}
++index;
id += String.fromCharCode(ch);
ch = fromCodePoint(cp);
id += ch;
index += ch.length;

// '\u' (U+005C, U+0075) denotes an escaped character.
if (ch === 0x5C) {
if (cp === 0x5C) {
id = id.substr(0, id.length - 1);
if (source.charCodeAt(index) !== 0x75) {
throwUnexpectedToken();
Expand All @@ -658,7 +674,8 @@
ch = scanUnicodeCodePointEscape();
} else {
ch = scanHexEscape('u');
if (!ch || ch === '\\' || !isIdentifierPart(ch.charCodeAt(0))) {
cp = ch.charCodeAt(0);
if (!ch || ch === '\\' || !isIdentifierPart(cp)) {
throwUnexpectedToken();
}
}
Expand All @@ -678,7 +695,11 @@
if (ch === 0x5C) {
// Backslash (U+005C) marks a Unicode escape sequence.
index = start;
return getEscapedIdentifier();
return getComplexIdentifier();
} else if (ch >= 0xD800 && ch < 0xDFFF) {
// Need to handle surrogate pairs.
index = start;
return getComplexIdentifier();
}
if (isIdentifierPart(ch)) {
++index;
Expand All @@ -696,7 +717,7 @@
start = index;

// Backslash (U+005C) starts an escaped character.
id = (source.charCodeAt(index) === 0x5C) ? getEscapedIdentifier() : getIdentifier();
id = (source.charCodeAt(index) === 0x5C) ? getComplexIdentifier() : getIdentifier();

// There is no keyword or literal with only one character.
// Thus, it must be an identifier.
Expand Down Expand Up @@ -1550,7 +1571,7 @@
}

function advance() {
var ch, token;
var cp, token;

if (index >= length) {
return {
Expand All @@ -1562,9 +1583,9 @@
};
}

ch = source.charCodeAt(index);
cp = source.charCodeAt(index);

if (isIdentifierStart(ch)) {
if (isIdentifierStart(cp)) {
token = scanIdentifier();
if (strict && isStrictModeReservedWord(token.value)) {
token.type = Token.Keyword;
Expand All @@ -1573,39 +1594,47 @@
}

// Very common: ( and ) and ;
if (ch === 0x28 || ch === 0x29 || ch === 0x3B) {
if (cp === 0x28 || cp === 0x29 || cp === 0x3B) {
return scanPunctuator();
}

// String literal starts with single quote (U+0027) or double quote (U+0022).
if (ch === 0x27 || ch === 0x22) {
if (cp === 0x27 || cp === 0x22) {
return scanStringLiteral();
}

// Dot (.) U+002E can also start a floating-point number, hence the need
// to check the next character.
if (ch === 0x2E) {
if (cp === 0x2E) {
if (isDecimalDigit(source.charCodeAt(index + 1))) {
return scanNumericLiteral();
}
return scanPunctuator();
}

if (isDecimalDigit(ch)) {
if (isDecimalDigit(cp)) {
return scanNumericLiteral();
}

// Slash (/) U+002F can also start a regex.
if (extra.tokenize && ch === 0x2F) {
if (extra.tokenize && cp === 0x2F) {
return advanceSlash();
}

// Template literals start with ` (U+0060) for template head
// or } (U+007D) for template middle or template tail.
if (ch === 0x60 || (ch === 0x7D && state.curlyStack[state.curlyStack.length - 1] === '${')) {
if (cp === 0x60 || (cp === 0x7D && state.curlyStack[state.curlyStack.length - 1] === '${')) {
return scanTemplate();
}

// Possible identifier start in a surrogate pair.
if (cp >= 0xD800 && cp < 0xDFFF) {
cp = codePointAt(index);
if (isIdentifierStart(cp)) {
return scanIdentifier();
}
}

return scanPunctuator();
}

Expand Down
2 changes: 2 additions & 0 deletions test/fixtures/ES6/identifier/escaped_math_alef.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
var \u{1EE00}

75 changes: 75 additions & 0 deletions test/fixtures/ES6/identifier/escaped_math_alef.tree.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
{
"range": [
0,
13
],
"loc": {
"start": {
"line": 1,
"column": 0
},
"end": {
"line": 1,
"column": 13
}
},
"type": "Program",
"body": [
{
"range": [
0,
13
],
"loc": {
"start": {
"line": 1,
"column": 0
},
"end": {
"line": 1,
"column": 13
}
},
"type": "VariableDeclaration",
"declarations": [
{
"range": [
4,
13
],
"loc": {
"start": {
"line": 1,
"column": 4
},
"end": {
"line": 1,
"column": 13
}
},
"type": "VariableDeclarator",
"id": {
"range": [
4,
13
],
"loc": {
"start": {
"line": 1,
"column": 4
},
"end": {
"line": 1,
"column": 13
}
},
"type": "Identifier",
"name": "𞸀"
},
"init": null
}
],
"kind": "var"
}
]
}
1 change: 1 addition & 0 deletions test/fixtures/ES6/identifier/escaped_math_dal_part.js
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
var _\u{1EE03}
75 changes: 75 additions & 0 deletions test/fixtures/ES6/identifier/escaped_math_dal_part.tree.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
{
"range": [
0,
14
],
"loc": {
"start": {
"line": 1,
"column": 0
},
"end": {
"line": 1,
"column": 14
}
},
"type": "Program",
"body": [
{
"range": [
0,
14
],
"loc": {
"start": {
"line": 1,
"column": 0
},
"end": {
"line": 1,
"column": 14
}
},
"type": "VariableDeclaration",
"declarations": [
{
"range": [
4,
14
],
"loc": {
"start": {
"line": 1,
"column": 4
},
"end": {
"line": 1,
"column": 14
}
},
"type": "VariableDeclarator",
"id": {
"range": [
4,
14
],
"loc": {
"start": {
"line": 1,
"column": 4
},
"end": {
"line": 1,
"column": 14
}
},
"type": "Identifier",
"name": "_𞸃"
},
"init": null
}
],
"kind": "var"
}
]
}
1 change: 1 addition & 0 deletions test/fixtures/ES6/identifier/escaped_math_kaf_lam.js
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
var \u{1EE0A}\u{1EE0B}
Loading

0 comments on commit 014fa4c

Please sign in to comment.