diff --git a/src/addons/unicode-base.js b/src/addons/unicode-base.js index 0d2186b..95f906f 100644 --- a/src/addons/unicode-base.js +++ b/src/addons/unicode-base.js @@ -26,6 +26,7 @@ export default (XRegExp) => { // Storage for Unicode data const unicode = {}; + const unicodeTypes = {}; // Reuse utils const dec = XRegExp._dec; @@ -123,41 +124,56 @@ export default (XRegExp) => { */ XRegExp.addToken( // Use `*` instead of `+` to avoid capturing `^` as the token name in `\p{^}` - /\\([pP])(?:{(\^?)([^}]*)}|([A-Za-z]))/, + /\\([pP])(?:{(\^?)(?:(Script|sc)=)?([^}]*)}|([A-Za-z]))/, (match, scope, flags) => { const ERR_DOUBLE_NEG = 'Invalid double negation '; const ERR_UNKNOWN_NAME = 'Unknown Unicode token '; const ERR_UNKNOWN_REF = 'Unicode token missing data '; const ERR_ASTRAL_ONLY = 'Astral mode required for Unicode token '; const ERR_ASTRAL_IN_CLASS = 'Astral mode does not support Unicode tokens within character classes'; + const [ + fullToken, + pPrefix, + caretNegation, + typePrefix, + tokenName, + tokenSingleCharName + ] = match; // Negated via \P{..} or \p{^..} - let isNegated = match[1] === 'P' || !!match[2]; + let isNegated = pPrefix === 'P' || !!caretNegation; // Switch from BMP (0-FFFF) to astral (0-10FFFF) mode via flag A const isAstralMode = flags.includes('A'); - // Token lookup name. Check `[4]` first to avoid passing `undefined` via `\p{}` - let slug = normalize(match[4] || match[3]); + // Token lookup name. Check `tokenSingleCharName` first to avoid passing `undefined` + // via `\p{}` + let slug = normalize(tokenSingleCharName || tokenName); // Token data object let item = unicode[slug]; - if (match[1] === 'P' && match[2]) { - throw new SyntaxError(ERR_DOUBLE_NEG + match[0]); + if (pPrefix === 'P' && caretNegation) { + throw new SyntaxError(ERR_DOUBLE_NEG + fullToken); } if (!unicode.hasOwnProperty(slug)) { - throw new SyntaxError(ERR_UNKNOWN_NAME + match[0]); + throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken); + } + + if (typePrefix) { + if (!(unicodeTypes[typePrefix] && unicodeTypes[typePrefix][slug])) { + throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken); + } } // Switch to the negated form of the referenced Unicode token if (item.inverseOf) { slug = normalize(item.inverseOf); if (!unicode.hasOwnProperty(slug)) { - throw new ReferenceError(`${ERR_UNKNOWN_REF + match[0]} -> ${item.inverseOf}`); + throw new ReferenceError(`${ERR_UNKNOWN_REF + fullToken} -> ${item.inverseOf}`); } item = unicode[slug]; isNegated = !isNegated; } if (!(item.bmp || isAstralMode)) { - throw new SyntaxError(ERR_ASTRAL_ONLY + match[0]); + throw new SyntaxError(ERR_ASTRAL_ONLY + fullToken); } if (isAstralMode) { if (scope === 'class') { @@ -196,6 +212,9 @@ export default (XRegExp) => { * character classes and alternation, and should use surrogate pairs to represent astral code * points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is * defined as the exact inverse of another token. + * @param {String} [typePrefix] Enables optionally using this type as a prefix for all of the + * provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written + * as `\p{Type=TokenName}`. * @example * * // Basic use @@ -206,10 +225,15 @@ export default (XRegExp) => { * }]); * XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true */ - XRegExp.addUnicodeData = (data) => { + XRegExp.addUnicodeData = (data, typePrefix) => { const ERR_NO_NAME = 'Unicode token requires name'; const ERR_NO_DATA = 'Unicode token has no character data '; + if (typePrefix) { + // Case sensitive to match ES2018 + unicodeTypes[typePrefix] = {}; + } + for (const item of data) { if (!item.name) { throw new Error(ERR_NO_NAME); @@ -217,9 +241,19 @@ export default (XRegExp) => { if (!(item.inverseOf || item.bmp || item.astral)) { throw new Error(ERR_NO_DATA + item.name); } - unicode[normalize(item.name)] = item; + + const normalizedName = normalize(item.name); + unicode[normalizedName] = item; + if (typePrefix) { + unicodeTypes[typePrefix][normalizedName] = true; + } + if (item.alias) { - unicode[normalize(item.alias)] = item; + const normalizedAlias = normalize(item.alias); + unicode[normalizedAlias] = item; + if (typePrefix) { + unicodeTypes[typePrefix][normalizedAlias] = true; + } } } diff --git a/src/addons/unicode-scripts.js b/src/addons/unicode-scripts.js index 001e612..3ac0837 100644 --- a/src/addons/unicode-scripts.js +++ b/src/addons/unicode-scripts.js @@ -22,5 +22,5 @@ export default (XRegExp) => { throw new ReferenceError('Unicode Base must be loaded before Unicode Scripts'); } - XRegExp.addUnicodeData(scripts); + XRegExp.addUnicodeData(scripts, 'Script'); }; diff --git a/tests/spec/s-addons-unicode.js b/tests/spec/s-addons-unicode.js index f6bb0ae..413ff70 100644 --- a/tests/spec/s-addons-unicode.js +++ b/tests/spec/s-addons-unicode.js @@ -417,6 +417,10 @@ describe('Unicode Categories addon:', function() { expect(function() {XRegExp('\\p{IsP}');}).toThrowError(SyntaxError); }); + it('should not allow the "Script=" prefix for category names', function() { + expect(function() {XRegExp('\\p{Script=P}');}).toThrowError(SyntaxError); + }); + it('should handle \\p{Cn}', function() { testUnicodeToken('Cn', { invalid: ['\u20BA'] @@ -489,6 +493,10 @@ describe('Unicode Properties addon:', function() { expect(function() {XRegExp('\\p{IsASCII}');}).toThrowError(SyntaxError); }); + it('should not allow the "Script=" prefix for property names', function() { + expect(function() {XRegExp('\\p{Script=ASCII}');}).toThrowError(SyntaxError); + }); + it('should handle \\p{Alphabetic}', function() { testUnicodeToken('Alphabetic', { valid: ['A', 'a', 'Å', 'å', '日', 'ي'], @@ -529,6 +537,21 @@ describe('Unicode Scripts addon:', function() { expect(function() {XRegExp('\\p{IsLatin}');}).toThrowError(SyntaxError); }); + it('should allow the "Script=" prefix for script names', function() { + expect(function() {XRegExp('\\p{Script=Latin}');}).not.toThrow(); + testUnicodeToken('Script=Latin', { + valid: ['A', 'B', 'C'], + invalid: ['カ', 'タ', 'ナ'] + }); + }); + + it('should handle \\p{Latin}', function() { + testUnicodeToken('Latin', { + valid: ['A', 'B', 'C'], + invalid: ['カ', 'タ', 'ナ'] + }); + }); + it('should handle \\p{Katakana}', function() { testUnicodeToken('Katakana', { valid: ['カ', 'タ', 'ナ'], diff --git a/types/index.d.ts b/types/index.d.ts index 99f8bd6..6cc9e83 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -497,6 +497,9 @@ declare namespace XRegExp { * character classes and alternation, and should use surrogate pairs to represent astral code * points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is * defined as the exact inverse of another token. + * @param typePrefix - Enables optionally using this type as a prefix for all of the + * provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written + * as `\p{Type=TokenName}`. * @example * * // Basic use @@ -507,7 +510,7 @@ declare namespace XRegExp { * }]); * XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true */ - function addUnicodeData(data: UnicodeCharacterRange[]): void; + function addUnicodeData(data: UnicodeCharacterRange[], typePrefix?: string): void; /** * Builds regexes using named subpatterns, for readability and pattern reuse. Backreferences in