From e1237f7e441612e8da71c972aa80baf061a392fb Mon Sep 17 00:00:00 2001 From: Nick Frasser Date: Mon, 11 Oct 2021 11:10:03 -0400 Subject: [PATCH] Scanner token parsing refactor (#353) * Refactor scanner to emit more granular tokens The DOMAIN token (among others) has been removed in favour of WORD, UWORD and other tokens. SCHEME (formerly PROTOCOL) tokens now come in several flavours. Includes additional facilities for token groups, which will be useful for future plugins down the line Also fixes file URL behaviour * Update plugins to use new scanner tokens Also improves hashtag and mention plugin accuracy * Update tests to work with new scanner * Better workspace packages to improve build order * Update benchmark require * Remove resolved FIXME * Additional tests for linkify register functions Fixes #171 Fixes #245 Fixes #351 --- package.json | 2 + packages/linkifyjs/src/core/fsm.js | 38 ++- packages/linkifyjs/src/core/parser.js | 351 ++++++++++---------- packages/linkifyjs/src/core/scanner.js | 207 ++++++------ packages/linkifyjs/src/core/tlds.js | 15 +- packages/linkifyjs/src/core/tokens/multi.js | 47 +-- packages/linkifyjs/src/core/tokens/text.js | 60 +++- packages/linkifyjs/src/linkify.js | 22 +- packages/linkifyjs/src/plugins/hashtag.js | 36 +- packages/linkifyjs/src/plugins/mention.js | 58 +--- packages/linkifyjs/src/plugins/ticket.js | 10 +- test/benchmarks.js | 6 +- test/spec/core/parser.test.js | 25 +- test/spec/core/scanner.test.js | 117 ++++--- test/spec/core/tokens/multi.test.js | 36 +- test/spec/linkifyjs.test.js | 97 +++++- test/spec/plugins/hashtag.test.js | 49 ++- test/spec/plugins/mention.test.js | 73 ++-- 18 files changed, 714 insertions(+), 535 deletions(-) diff --git a/package.json b/package.json index c5315a98..de346bcf 100644 --- a/package.json +++ b/package.json @@ -61,6 +61,8 @@ "node": ">=8" }, "workspaces": [ + "./packages/linkifyjs", + "./packages/linkify-plugin-*/", "./packages/*" ] } diff --git a/packages/linkifyjs/src/core/fsm.js 
b/packages/linkifyjs/src/core/fsm.js index 72d465a8..9d30b731 100644 --- a/packages/linkifyjs/src/core/fsm.js +++ b/packages/linkifyjs/src/core/fsm.js @@ -10,6 +10,7 @@ * @param {string|class} token to emit */ export function State(token) { + // this.n = null; // DEBUG: State name this.j = {}; // IMPLEMENTATION 1 // this.j = []; // IMPLEMENTATION 2 this.jr = []; @@ -49,11 +50,21 @@ State.prototype = { * transitioned to on the given input regardless of what that input * previously did. * - * @param {string} input character or token to transition on + * @param {string} input character or token type to transition on * @param {Token|State} tokenOrState transition to a matching state * @returns State taken after the given input */ tt(input, tokenOrState) { + if (input instanceof Array) { + // Recursive case + if (input.length === 0) { return; } + const nextState = this.tt(input[0], tokenOrState); + for (let i = 1; i < input.length; i++) { + this.tt(input[i], nextState); + } + return nextState; + } + if (tokenOrState && tokenOrState.j) { // State, default a basic transition this.j[input] = tokenOrState; @@ -92,13 +103,21 @@ State.prototype = { * Utility function to create state without using new keyword (reduced file size * when minified) */ -export const makeState = () => new State(); +export const makeState = (/*name*/) => { + const s = new State(); + // if (name) { s.n = name; } // DEBUG + return s; +}; /** * Similar to previous except it is an accepting state that emits a token * @param {Token} token */ -export const makeAcceptingState = (token) => new State(token); +export const makeAcceptingState = (token/*, name*/) => { + const s = new State(token); + // if (name) { s.n = name; } // DEBUG + return s; +}; /** * Create a transition from startState to nextState via the given character @@ -112,6 +131,7 @@ export const makeT = (startState, input, nextState) => { // IMPLEMENTATION 2: Add to array (slower) // startState.j.push([input, nextState]); + return 
startState.j[input]; }; /** @@ -127,7 +147,7 @@ export const makeRegexT = (startState, regex, nextState) => { /** * Follow the transition from the given character to the next state * @param {State} state - * @param {Token} input character or other concrete token type to transition + * @param {string|Token} input character or other concrete token type to transition * @returns {?State} the next state, if any */ export const takeT = (state, input) => { @@ -145,8 +165,8 @@ export const takeT = (state, input) => { for (let i = 0; i < state.jr.length; i++) { const regex = state.jr[i][0]; - const nextState = state.jr[i][1]; - if (regex.test(input)) {return nextState;} + const nextState = state.jr[i][1]; // note: might be empty to prevent default jump + if (nextState && regex.test(input)) { return nextState; } } // Nowhere left to jump! Return default, if any return state.jd; @@ -176,6 +196,7 @@ export const makeBatchT = (startState, transitions) => { for (let i = 0; i < transitions.length; i++) { const input = transitions[i][0]; const nextState = transitions[i][1]; + // if (!nextState.n && typeof input === 'string') { nextState.n = input; } // DEBUG makeT(startState, input, nextState); } }; @@ -193,6 +214,7 @@ export const makeBatchT = (startState, transitions) => { * @param {string} str * @param {Token} endStateFactory * @param {Token} defaultStateFactory + * @return {State} the final state */ export const makeChainT = (state, str, endState, defaultStateFactory) => { let i = 0, len = str.length, nextState; @@ -203,7 +225,7 @@ export const makeChainT = (state, str, endState, defaultStateFactory) => { i++; } - if (i >= len) { return []; } // no new tokens were added + if (i >= len) { return state; } // no new tokens were added while (i < len - 1) { nextState = defaultStateFactory(); @@ -213,4 +235,6 @@ export const makeChainT = (state, str, endState, defaultStateFactory) => { } makeT(state, str[len - 1], endState); + // if (!endState.n) { endState.n = str; } // DEBUG + 
return endState; }; diff --git a/packages/linkifyjs/src/core/parser.js b/packages/linkifyjs/src/core/parser.js index ef3de40b..4fd2c2fc 100644 --- a/packages/linkifyjs/src/core/parser.js +++ b/packages/linkifyjs/src/core/parser.js @@ -5,7 +5,7 @@ overkill). URL format: http://en.wikipedia.org/wiki/URI_scheme - Email format: http://en.wikipedia.org/wiki/Email_address (links to RFC in + Email format: http://en.wikipedia.org/wiki/EmailAddress (links to RFC in reference) @module linkify @@ -29,99 +29,99 @@ import * as mtk from './tokens/multi'; */ export function init() { // The universal starting state. - let S_START = makeState(); + const Start = makeState(); // Intermediate states for URLs. Note that domains that begin with a protocol // are treated slighly differently from those that don't. - let S_PROTOCOL = makeState(); // e.g., 'http:' - let S_MAILTO = makeState(); // 'mailto:' - let S_PROTOCOL_SLASH = makeState(); // e.g., 'http:/'' - let S_PROTOCOL_SLASH_SLASH = makeState(); // e.g.,'http://' - let S_DOMAIN = makeState(); // parsed string ends with a potential domain name (A) - let S_DOMAIN_DOT = makeState(); // (A) domain followed by DOT - let S_TLD = makeAcceptingState(mtk.Url); // (A) Simplest possible URL with no query string - let S_TLD_COLON = makeState(); // (A) URL followed by colon (potential port number here) - let S_TLD_PORT = makeAcceptingState(mtk.Url); // TLD followed by a port number - let S_URL = makeAcceptingState(mtk.Url); // Long URL with optional port and maybe query string - let S_URL_NON_ACCEPTING = makeState(); // URL followed by some symbols (will not be part of the final URL) - let S_URL_OPENBRACE = makeState(); // URL followed by { - let S_URL_OPENBRACKET = makeState(); // URL followed by [ - let S_URL_OPENANGLEBRACKET = makeState(); // URL followed by < - let S_URL_OPENPAREN = makeState(); // URL followed by ( - let S_URL_OPENBRACE_Q = makeAcceptingState(mtk.Url); // URL followed by { and some symbols that the URL can end it - let 
S_URL_OPENBRACKET_Q = makeAcceptingState(mtk.Url); // URL followed by [ and some symbols that the URL can end it - let S_URL_OPENANGLEBRACKET_Q = makeAcceptingState(mtk.Url); // URL followed by < and some symbols that the URL can end it - let S_URL_OPENPAREN_Q = makeAcceptingState(mtk.Url); // URL followed by ( and some symbols that the URL can end it - let S_URL_OPENBRACE_SYMS = makeState(); // S_URL_OPENBRACE_Q followed by some symbols it cannot end it - let S_URL_OPENBRACKET_SYMS = makeState(); // S_URL_OPENBRACKET_Q followed by some symbols it cannot end it - let S_URL_OPENANGLEBRACKET_SYMS = makeState(); // S_URL_OPENANGLEBRACKET_Q followed by some symbols it cannot end it - let S_URL_OPENPAREN_SYMS = makeState(); // S_URL_OPENPAREN_Q followed by some symbols it cannot end it - let S_EMAIL_DOMAIN = makeState(); // parsed string starts with local email info + @ with a potential domain name (C) - let S_EMAIL_DOMAIN_DOT = makeState(); // (C) domain followed by DOT - let S_EMAIL = makeAcceptingState(mtk.Email); // (C) Possible email address (could have more tlds) - let S_EMAIL_COLON = makeState(); // (C) URL followed by colon (potential port number here) - let S_EMAIL_PORT = makeAcceptingState(mtk.Email); // (C) Email address with a port - let S_MAILTO_EMAIL = makeAcceptingState(mtk.MailtoEmail); // Email that begins with the mailto prefix (D) - let S_MAILTO_EMAIL_NON_ACCEPTING = makeState(); // (D) Followed by some non-query string chars - let S_LOCALPART = makeState(); // Local part of the email address - let S_LOCALPART_AT = makeState(); // Local part of the email address plus @ - let S_LOCALPART_DOT = makeState(); // Local part of the email address plus '.' (localpart cannot end in .) 
- let S_NL = makeAcceptingState(mtk.Nl); // single new line + const Scheme = makeState(); // e.g., 'mailto' + const SlashScheme = makeState(); // e.g., 'http' + const SlashSchemeColon = makeState(); // e.g., 'http:' + const SlashSchemeColonSlash = makeState(); // e.g., 'http:/' + const UriPrefix = makeState(); // e.g., 'mailto:' or 'http://' + + const Domain = makeState(); // parsed string ends with a potential domain name (A) + const DomainDot = makeState(); // domain followed by DOT + const DomainHyphen = makeState(); // domain followed by hyphen + const DomainDotTld = makeAcceptingState(mtk.Url); // Simplest possible URL with no query string + const DomainDotTldColon = makeState(); // URL followed by colon (potential port number here) + const DomainDotTldColonPort = makeAcceptingState(mtk.Url); // TLD followed by a port number + + const Url = makeAcceptingState(mtk.Url); // Long URL with optional port and maybe query string + const UrlNonaccept = makeState(); // URL followed by some symbols (will not be part of the final URL) + const UrlOpenbrace = makeState(); // URL followed by { + const UrlOpenbracket = makeState(); // URL followed by [ + const UrlOpenanglebracket = makeState(); // URL followed by < + const UrlOpenparen = makeState(); // URL followed by ( + const UrlOpenbraceQ = makeAcceptingState(mtk.Url); // URL followed by { and some symbols that the URL can end it + const UrlOpenbracketQ = makeAcceptingState(mtk.Url); // URL followed by [ and some symbols that the URL can end it + const UrlOpenanglebracketQ = makeAcceptingState(mtk.Url); // URL followed by < and some symbols that the URL can end it + const UrlOpenparenQ = makeAcceptingState(mtk.Url); // URL followed by ( and some symbols that the URL can end it + const UrlOpenbraceSyms = makeState(); // UrlOpenbrace followed by some symbols it cannot end it + const UrlOpenbracketSyms = makeState(); // UrlOpenbracketQ followed by some symbols it cannot end it + const UrlOpenanglebracketSyms = makeState(); 
// UrlOpenanglebracketQ followed by some symbols it cannot end it + const UrlOpenparenSyms = makeState(); // UrlOpenparenQ followed by some symbols it cannot end it + + const EmailDomain = makeState(); // parsed string starts with local email info + @ with a potential domain name + const EmailDomainDot = makeState(); // domain followed by DOT + const EmailDomainHyphen = makeState(); // email domain followed by hyphen + const Email = makeAcceptingState(mtk.Email); // Possible email address (could have more tlds) + const EmailColon = makeState(); // Email address followed by colon (potential port number here) + const EmailColonPort = makeAcceptingState(mtk.Email); // Email address followed by colon and potential port number + const Localpart = makeState(); // Local part of the email address + const LocalpartAt = makeState(); // Local part of the email address plus @ + const LocalpartAtNum = makeState(); // Local part of the email address plus @ plus a number + const LocalpartDot = makeState(); // Local part of the email address plus '.' (localpart cannot end in .) 
+ + const Nl = makeAcceptingState(mtk.Nl); // single new line // Make path from start to protocol (with '//') - makeT(S_START, tk.NL, S_NL); - makeT(S_START, tk.PROTOCOL, S_PROTOCOL); - makeT(S_START, tk.MAILTO, S_MAILTO); - - makeT(S_PROTOCOL, tk.SLASH, S_PROTOCOL_SLASH); - makeT(S_PROTOCOL_SLASH, tk.SLASH, S_PROTOCOL_SLASH_SLASH); - - // The very first potential domain name - makeT(S_START, tk.TLD, S_DOMAIN); - makeT(S_START, tk.DOMAIN, S_DOMAIN); - makeT(S_START, tk.LOCALHOST, S_TLD); - makeT(S_START, tk.NUM, S_DOMAIN); - - // Force URL for protocol followed by anything sane - makeT(S_PROTOCOL_SLASH_SLASH, tk.TLD, S_URL); - makeT(S_PROTOCOL_SLASH_SLASH, tk.DOMAIN, S_URL); - makeT(S_PROTOCOL_SLASH_SLASH, tk.NUM, S_URL); - makeT(S_PROTOCOL_SLASH_SLASH, tk.LOCALHOST, S_URL); - - // Account for dots and hyphens - // hyphens are usually parts of domain names - makeT(S_DOMAIN, tk.DOT, S_DOMAIN_DOT); - makeT(S_EMAIL_DOMAIN, tk.DOT, S_EMAIL_DOMAIN_DOT); - + makeT(Start, tk.NL, Nl); + makeT(Start, tk.SCHEME, Scheme); + makeT(Start, tk.SLASH_SCHEME, SlashScheme); + makeT(Start, tk.COMPOUND_SCHEME, Scheme); + makeT(Start, tk.COMPOUND_SLASH_SCHEME, SlashScheme); + + // Most transitions after a UriPrefix will be considered URL tokens + makeT(Scheme, tk.COLON, UriPrefix); + makeT(SlashScheme, tk.COLON, SlashSchemeColon); + makeT(SlashSchemeColon, tk.SLASH, SlashSchemeColonSlash); + makeT(SlashSchemeColonSlash, tk.SLASH, UriPrefix); + + // The very first potential domain name + full URL + makeT(Start, tk.LOCALHOST, DomainDotTld); + + // Some transitions from this call are ignored because they're already + // accounted for in the scheme state definitions above + makeMultiT(Start, tk.domain, Domain); + + + // Account for dots and hyphens. 
Hyphens are usually parts of domain names + // (but not TLDs) + makeT(Domain, tk.DOT, DomainDot); + makeT(Domain, tk.HYPHEN, DomainHyphen); + makeMultiT(Domain, tk.domain, Domain); + makeT(DomainDot, tk.TLD, DomainDotTld); + makeT(DomainDot, tk.UTLD, DomainDotTld); + makeMultiT(DomainDot, tk.domain, Domain); // Hyphen can jump back to a domain name - - // After the first domain and a dot, we can find either a URL or another domain - makeT(S_DOMAIN_DOT, tk.TLD, S_TLD); - makeT(S_DOMAIN_DOT, tk.DOMAIN, S_DOMAIN); - makeT(S_DOMAIN_DOT, tk.NUM, S_DOMAIN); - makeT(S_DOMAIN_DOT, tk.LOCALHOST, S_DOMAIN); - - makeT(S_EMAIL_DOMAIN_DOT, tk.TLD, S_EMAIL); - makeT(S_EMAIL_DOMAIN_DOT, tk.DOMAIN, S_EMAIL_DOMAIN); - makeT(S_EMAIL_DOMAIN_DOT, tk.NUM, S_EMAIL_DOMAIN); - makeT(S_EMAIL_DOMAIN_DOT, tk.LOCALHOST, S_EMAIL_DOMAIN); - - // S_TLD accepts! But the URL could be longer, try to find a match greedily - // The `run` function should be able to "rollback" to the accepting state - makeT(S_TLD, tk.DOT, S_DOMAIN_DOT); - makeT(S_EMAIL, tk.DOT, S_EMAIL_DOMAIN_DOT); + makeMultiT(DomainHyphen, tk.domain, Domain); + makeT(DomainDotTld, tk.DOT, DomainDot); + makeT(DomainDotTld, tk.HYPHEN, DomainHyphen); + makeMultiT(DomainDotTld, tk.domain, Domain); // Become real URLs after `SLASH` or `COLON NUM SLASH` - // Here PSS and non-PSS converge - makeT(S_TLD, tk.COLON, S_TLD_COLON); - makeT(S_TLD, tk.SLASH, S_URL); - makeT(S_TLD_COLON, tk.NUM, S_TLD_PORT); - makeT(S_TLD_PORT, tk.SLASH, S_URL); - makeT(S_EMAIL, tk.COLON, S_EMAIL_COLON); - makeT(S_EMAIL_COLON, tk.NUM, S_EMAIL_PORT); + // Here works with or without scheme:// prefix + makeT(DomainDotTld, tk.COLON, DomainDotTldColon); + makeT(DomainDotTld, tk.SLASH, Url); + makeMultiT(DomainDotTldColon, tk.numeric, DomainDotTldColonPort); + makeT(DomainDotTldColonPort, tk.SLASH, Url); + + // Force URL with scheme prefix followed by anything sane + makeT(UriPrefix, tk.SLASH, Url); + makeMultiT(UriPrefix, tk.domain, Url); // Types of characters the URL 
can definitely end in - const qsAccepting = [ + const qsAccepting = tk.domain.concat([ tk.AMPERSAND, tk.ASTERISK, tk.AT, @@ -129,22 +129,18 @@ export function init() { tk.BACKTICK, tk.CARET, tk.DOLLAR, - tk.DOMAIN, tk.EQUALS, tk.HYPHEN, - tk.LOCALHOST, tk.NUM, tk.PERCENT, tk.PIPE, tk.PLUS, tk.POUND, - tk.PROTOCOL, tk.SLASH, tk.SYM, tk.TILDE, - tk.TLD, tk.UNDERSCORE - ]; + ]); // Types of tokens that can follow a URL and be part of the query string // but cannot be the very last characters @@ -172,86 +168,73 @@ export function init() { // include the final round bracket. // URL, followed by an opening bracket - makeT(S_URL, tk.OPENBRACE, S_URL_OPENBRACE); - makeT(S_URL, tk.OPENBRACKET, S_URL_OPENBRACKET); - makeT(S_URL, tk.OPENANGLEBRACKET, S_URL_OPENANGLEBRACKET); - makeT(S_URL, tk.OPENPAREN, S_URL_OPENPAREN); + makeT(Url, tk.OPENBRACE, UrlOpenbrace); + makeT(Url, tk.OPENBRACKET, UrlOpenbracket); + makeT(Url, tk.OPENANGLEBRACKET, UrlOpenanglebracket); + makeT(Url, tk.OPENPAREN, UrlOpenparen); // URL with extra symbols at the end, followed by an opening bracket - makeT(S_URL_NON_ACCEPTING, tk.OPENBRACE, S_URL_OPENBRACE); - makeT(S_URL_NON_ACCEPTING, tk.OPENBRACKET, S_URL_OPENBRACKET); - makeT(S_URL_NON_ACCEPTING, tk.OPENANGLEBRACKET, S_URL_OPENANGLEBRACKET); - makeT(S_URL_NON_ACCEPTING, tk.OPENPAREN, S_URL_OPENPAREN); + makeT(UrlNonaccept, tk.OPENBRACE, UrlOpenbrace); + makeT(UrlNonaccept, tk.OPENBRACKET, UrlOpenbracket); + makeT(UrlNonaccept, tk.OPENANGLEBRACKET, UrlOpenanglebracket); + makeT(UrlNonaccept, tk.OPENPAREN, UrlOpenparen); // Closing bracket component. 
This character WILL be included in the URL - makeT(S_URL_OPENBRACE, tk.CLOSEBRACE, S_URL); - makeT(S_URL_OPENBRACKET, tk.CLOSEBRACKET, S_URL); - makeT(S_URL_OPENANGLEBRACKET, tk.CLOSEANGLEBRACKET, S_URL); - makeT(S_URL_OPENPAREN, tk.CLOSEPAREN, S_URL); - makeT(S_URL_OPENBRACE_Q, tk.CLOSEBRACE, S_URL); - makeT(S_URL_OPENBRACKET_Q, tk.CLOSEBRACKET, S_URL); - makeT(S_URL_OPENANGLEBRACKET_Q, tk.CLOSEANGLEBRACKET, S_URL); - makeT(S_URL_OPENPAREN_Q, tk.CLOSEPAREN, S_URL); - makeT(S_URL_OPENBRACE_SYMS, tk.CLOSEBRACE, S_URL); - makeT(S_URL_OPENBRACKET_SYMS, tk.CLOSEBRACKET, S_URL); - makeT(S_URL_OPENANGLEBRACKET_SYMS, tk.CLOSEANGLEBRACKET, S_URL); - makeT(S_URL_OPENPAREN_SYMS, tk.CLOSEPAREN, S_URL); + makeT(UrlOpenbrace, tk.CLOSEBRACE, Url); + makeT(UrlOpenbracket, tk.CLOSEBRACKET, Url); + makeT(UrlOpenanglebracket, tk.CLOSEANGLEBRACKET, Url); + makeT(UrlOpenparen, tk.CLOSEPAREN, Url); + makeT(UrlOpenbrace, tk.CLOSEBRACE, Url); + makeT(UrlOpenbracketQ, tk.CLOSEBRACKET, Url); + makeT(UrlOpenanglebracketQ, tk.CLOSEANGLEBRACKET, Url); + makeT(UrlOpenparenQ, tk.CLOSEPAREN, Url); + makeT(UrlOpenbrace, tk.CLOSEBRACE, Url); + makeT(UrlOpenbracketSyms, tk.CLOSEBRACKET, Url); + makeT(UrlOpenanglebracketSyms, tk.CLOSEANGLEBRACKET, Url); + makeT(UrlOpenparenSyms, tk.CLOSEPAREN, Url); // URL that beings with an opening bracket, followed by a symbols. - // Note that the final state can still be `S_URL_OPENBRACE_Q` (if the URL only + // Note that the final state can still be `UrlOpenbrace` (if the URL only // has a single opening bracket for some reason). 
- makeMultiT(S_URL_OPENBRACE, qsAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET, qsAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET, qsAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN, qsAccepting, S_URL_OPENPAREN_Q); - makeMultiT(S_URL_OPENBRACE, qsNonAccepting, S_URL_OPENBRACE_SYMS); - makeMultiT(S_URL_OPENBRACKET, qsNonAccepting, S_URL_OPENBRACKET_SYMS); - makeMultiT(S_URL_OPENANGLEBRACKET, qsNonAccepting, S_URL_OPENANGLEBRACKET_SYMS); - makeMultiT(S_URL_OPENPAREN, qsNonAccepting, S_URL_OPENPAREN_SYMS); + makeMultiT(UrlOpenbrace, qsAccepting, UrlOpenbrace); + makeMultiT(UrlOpenbracket, qsAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracket, qsAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparen, qsAccepting, UrlOpenparenQ); + makeMultiT(UrlOpenbrace, qsNonAccepting, UrlOpenbrace); + makeMultiT(UrlOpenbracket, qsNonAccepting, UrlOpenbracketSyms); + makeMultiT(UrlOpenanglebracket, qsNonAccepting, UrlOpenanglebracketSyms); + makeMultiT(UrlOpenparen, qsNonAccepting, UrlOpenparenSyms); // URL that begins with an opening bracket, followed by some symbols - makeMultiT(S_URL_OPENBRACE_Q, qsAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET_Q, qsAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET_Q, qsAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN_Q, qsAccepting, S_URL_OPENPAREN_Q); - makeMultiT(S_URL_OPENBRACE_Q, qsNonAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET_Q, qsNonAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET_Q, qsNonAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN_Q, qsNonAccepting, S_URL_OPENPAREN_Q); - - makeMultiT(S_URL_OPENBRACE_SYMS, qsAccepting, S_URL_OPENBRACE_Q); - makeMultiT(S_URL_OPENBRACKET_SYMS, qsAccepting, S_URL_OPENBRACKET_Q); - makeMultiT(S_URL_OPENANGLEBRACKET_SYMS, qsAccepting, S_URL_OPENANGLEBRACKET_Q); - makeMultiT(S_URL_OPENPAREN_SYMS, qsAccepting, S_URL_OPENPAREN_Q); - 
makeMultiT(S_URL_OPENBRACE_SYMS, qsNonAccepting, S_URL_OPENBRACE_SYMS); - makeMultiT(S_URL_OPENBRACKET_SYMS, qsNonAccepting, S_URL_OPENBRACKET_SYMS); - makeMultiT(S_URL_OPENANGLEBRACKET_SYMS, qsNonAccepting, S_URL_OPENANGLEBRACKET_SYMS); - makeMultiT(S_URL_OPENPAREN_SYMS, qsNonAccepting, S_URL_OPENPAREN_SYMS); + makeMultiT(UrlOpenbraceQ, qsAccepting, UrlOpenbraceQ); + makeMultiT(UrlOpenbracketQ, qsAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracketQ, qsAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparenQ, qsAccepting, UrlOpenparenQ); + makeMultiT(UrlOpenbraceQ, qsNonAccepting, UrlOpenbraceQ); + makeMultiT(UrlOpenbracketQ, qsNonAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracketQ, qsNonAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparenQ, qsNonAccepting, UrlOpenparenQ); + + makeMultiT(UrlOpenbraceSyms, qsAccepting, UrlOpenbraceSyms); + makeMultiT(UrlOpenbracketSyms, qsAccepting, UrlOpenbracketQ); + makeMultiT(UrlOpenanglebracketSyms, qsAccepting, UrlOpenanglebracketQ); + makeMultiT(UrlOpenparenSyms, qsAccepting, UrlOpenparenQ); + makeMultiT(UrlOpenbraceSyms, qsNonAccepting, UrlOpenbraceSyms); + makeMultiT(UrlOpenbracketSyms, qsNonAccepting, UrlOpenbracketSyms); + makeMultiT(UrlOpenanglebracketSyms, qsNonAccepting, UrlOpenanglebracketSyms); + makeMultiT(UrlOpenparenSyms, qsNonAccepting, UrlOpenparenSyms); // Account for the query string - makeMultiT(S_URL, qsAccepting, S_URL); - makeMultiT(S_URL_NON_ACCEPTING, qsAccepting, S_URL); + makeMultiT(Url, qsAccepting, Url); + makeMultiT(UrlNonaccept, qsAccepting, Url); - makeMultiT(S_URL, qsNonAccepting, S_URL_NON_ACCEPTING); - makeMultiT(S_URL_NON_ACCEPTING, qsNonAccepting, S_URL_NON_ACCEPTING); + makeMultiT(Url, qsNonAccepting, UrlNonaccept); + makeMultiT(UrlNonaccept, qsNonAccepting, UrlNonaccept); // Email address-specific state definitions // Note: We are not allowing '/' in email addresses since this would interfere // with real URLs - // For addresses with the mailto prefix - // 
'mailto:' followed by anything sane is a valid email - makeT(S_MAILTO, tk.TLD, S_MAILTO_EMAIL); - makeT(S_MAILTO, tk.DOMAIN, S_MAILTO_EMAIL); - makeT(S_MAILTO, tk.NUM, S_MAILTO_EMAIL); - makeT(S_MAILTO, tk.LOCALHOST, S_MAILTO_EMAIL); - - // Greedily get more potential valid email values - makeMultiT(S_MAILTO_EMAIL, qsAccepting, S_MAILTO_EMAIL); - makeMultiT(S_MAILTO_EMAIL, qsNonAccepting, S_MAILTO_EMAIL_NON_ACCEPTING); - makeMultiT(S_MAILTO_EMAIL_NON_ACCEPTING, qsAccepting, S_MAILTO_EMAIL); - makeMultiT(S_MAILTO_EMAIL_NON_ACCEPTING, qsNonAccepting, S_MAILTO_EMAIL_NON_ACCEPTING); - // For addresses without the mailto prefix // Tokens allowed in the localpart of the email const localpartAccepting = [ @@ -263,7 +246,6 @@ export function init() { tk.CARET, tk.CLOSEBRACE, tk.DOLLAR, - tk.DOMAIN, tk.EQUALS, tk.HYPHEN, tk.NUM, @@ -276,33 +258,47 @@ export function init() { tk.SLASH, tk.SYM, tk.TILDE, - tk.TLD, tk.UNDERSCORE ]; // Some of the tokens in `localpartAccepting` are already accounted for here and - // will not be overwritten (don't worry) - makeMultiT(S_DOMAIN, localpartAccepting, S_LOCALPART); - makeT(S_DOMAIN, tk.AT, S_LOCALPART_AT); - makeMultiT(S_TLD, localpartAccepting, S_LOCALPART); - makeT(S_TLD, tk.AT, S_LOCALPART_AT); - makeMultiT(S_DOMAIN_DOT, localpartAccepting, S_LOCALPART); + // will not be overwritten + makeMultiT(Domain, localpartAccepting, Localpart); + makeT(Domain, tk.AT, LocalpartAt); + makeMultiT(DomainDotTld, localpartAccepting, Localpart); + makeT(DomainDotTld, tk.AT, LocalpartAt); + makeMultiT(DomainDot, localpartAccepting, Localpart); // Now in localpart of address - // TODO: IP addresses and what if the email starts with numbers? 
- makeMultiT(S_LOCALPART, localpartAccepting, S_LOCALPART); - makeT(S_LOCALPART, tk.AT, S_LOCALPART_AT); // close to an email address now - makeT(S_LOCALPART, tk.DOT, S_LOCALPART_DOT); - makeMultiT(S_LOCALPART_DOT, localpartAccepting, S_LOCALPART); - makeT(S_LOCALPART_AT, tk.TLD, S_EMAIL_DOMAIN); - makeT(S_LOCALPART_AT, tk.DOMAIN, S_EMAIL_DOMAIN); - makeT(S_LOCALPART_AT, tk.NUM, S_EMAIL_DOMAIN); - makeT(S_LOCALPART_AT, tk.LOCALHOST, S_EMAIL); + makeMultiT(Localpart, tk.domain, Localpart); + makeMultiT(Localpart, localpartAccepting, Localpart); + makeT(Localpart, tk.AT, LocalpartAt); // close to an email address now + makeT(Localpart, tk.DOT, LocalpartDot); + makeMultiT(LocalpartDot, tk.domain, Localpart); + makeMultiT(LocalpartDot, localpartAccepting, Localpart); + makeT(LocalpartAt, tk.LOCALHOST, Email); + makeMultiT(LocalpartAt, tk.domain, EmailDomain); + makeMultiT(LocalpartAtNum, tk.domain, EmailDomain); + + makeT(EmailDomain, tk.DOT, EmailDomainDot); + makeT(EmailDomain, tk.HYPHEN, EmailDomainHyphen); + makeT(EmailDomainDot, tk.TLD, Email); + makeT(EmailDomainDot, tk.UTLD, Email); + makeMultiT(EmailDomainDot, tk.domain, EmailDomain); + + // Hyphen can jump back to a domain name + makeMultiT(EmailDomainHyphen, tk.domain, EmailDomain); + makeT(Email, tk.DOT, EmailDomainDot); + makeT(Email, tk.HYPHEN, EmailDomainHyphen); + makeMultiT(Email, tk.domain, EmailDomain); - // States following `@` defined above + // Become real URLs after `SLASH` or `COLON NUM SLASH` + // Here works with or without scheme:// prefix + makeT(Email, tk.COLON, EmailColon); + makeMultiT(EmailColon, tk.numeric, EmailColonPort); - return S_START; + return Start; } /** @@ -356,13 +352,14 @@ export function run(start, input, tokens) { } if (sinceAccepts < 0) { - - // No accepting state was found, part of a regular text token - // Add all the tokens we looked at to the text tokens array - for (let i = cursor - multiLength; i < cursor; i++) { - textTokens.push(tokens[i]); + // No accepting state 
was found, part of a regular text token add + // the first text token to the text tokens array and try again from + // the next + cursor -= multiLength; + if (cursor < len) { + textTokens.push(tokens[cursor]); + cursor++; } - } else { // Accepting state! // First close off the textTokens (if available) diff --git a/packages/linkifyjs/src/core/scanner.js b/packages/linkifyjs/src/core/scanner.js index 200af610..e1ac8f08 100644 --- a/packages/linkifyjs/src/core/scanner.js +++ b/packages/linkifyjs/src/core/scanner.js @@ -16,52 +16,74 @@ import { makeChainT } from './fsm'; import * as tk from './tokens/text'; -import tlds from './tlds'; +import { tlds, utlds } from './tlds'; // Note that these two Unicode ones expand into a really big one with Babel +export const ASCII_LETTER = /[a-z]/; export const LETTER = /\p{L}/u; // Any Unicode character with letter data type export const EMOJI = /\p{Emoji}/u; // Any Unicode emoji character -export const EMOJI_VARIATION = /\uFE0F/; // Variation selector, follows heart and others +export const EMOJI_VARIATION = /\ufe0f/; // Variation selector, follows heart and others export const DIGIT = /\d/; export const SPACE = /\s/; /** - * Initialize the scanner character-based state machine for the given start state + * Initialize the scanner character-based state machine for the given start + * state + * @param {[string, boolean][]} customSchemes List of custom schemes, where each + * item is a length-2 tuple with the first element set to the string scheme, and + * the second element set to `true` if the `://` after the scheme is optional * @return {State} scanner starting state */ -export function init(customProtocols = []) { - // Frequently used states - const S_START = makeState(); - const S_NUM = makeAcceptingState(tk.NUM); - const S_DOMAIN = makeAcceptingState(tk.DOMAIN); - const S_DOMAIN_HYPHEN = makeState(); // domain followed by 1 or more hyphen characters - const S_WS = makeAcceptingState(tk.WS); - - const DOMAIN_REGEX_TRANSITIONS = 
[ - [DIGIT, S_DOMAIN], - [LETTER, S_DOMAIN], - [EMOJI, S_DOMAIN], - [EMOJI_VARIATION, S_DOMAIN] - ]; - - // Create a state which emits a domain token - const makeDomainState = () => { - const state = makeAcceptingState(tk.DOMAIN); - state.j = {'-': S_DOMAIN_HYPHEN }; - state.jr = [...DOMAIN_REGEX_TRANSITIONS]; +export function init(customSchemes = []) { + // Frequently used states (name argument removed during minification) + const Start = makeState('Start'); + const NonAccepting = makeState('NonAccepting'); // must never have any transitions + const Num = makeAcceptingState(tk.NUM, 'Num'); + const Word = makeAcceptingState(tk.WORD, 'Word'); + const UWord = makeAcceptingState(tk.UWORD, 'UWord'); + const Emoji = makeAcceptingState(tk.EMOJIS, 'Emoji'); + const Ws = makeAcceptingState(tk.WS, 'Ws'); + + /** + * Create a state which emits a word token + */ + const makeWordState = (name) => { + const state = makeAcceptingState(tk.WORD, name); + state.jr = [[ASCII_LETTER, Word]]; return state; }; - // Create a state which does not emit a domain state but the usual alphanumeric - // transitions are domains - const makeNearDomainState = (token) => { - const state = makeDomainState(); + /** + * Same as previous, but specific to non-ASCII alphabet words + */ + const makeUWordState = (name) => { + const state = makeAcceptingState(tk.UWORD, name); + state.jr = [[ASCII_LETTER, NonAccepting], [LETTER, UWord]]; + return state; + }; + + /** + * Create a state which does not emit a word but the usual alphanumeric + * transitions are domains + */ + const makeNearWordState = (token, name) => { + const state = makeWordState(name); + state.t = token; + return state; + }; + + /** + * Create a state which does not emit a word but the usual alphanumeric + * transitions are domains + */ + const makeNearUWordState = (token, name) => { + const state = makeUWordState(name); state.t = token; return state; }; // States for special URL symbols that accept immediately after start - 
makeBatchT(S_START, [ + makeBatchT(Start, [ ["'", makeAcceptingState(tk.APOSTROPHE)], ['{', makeAcceptingState(tk.OPENBRACE)], ['[', makeAcceptingState(tk.OPENBRACKET)], @@ -98,83 +120,83 @@ export function init(customProtocols = []) { // Whitespace jumps // Tokens of only non-newline whitespace are arbitrarily long - makeT(S_START, '\n', makeAcceptingState(tk.NL)); - makeRegexT(S_START, SPACE, S_WS); + makeT(Start, '\n', makeAcceptingState(tk.NL, 'Nl')); + makeRegexT(Start, SPACE, Ws); // If any whitespace except newline, more whitespace! - makeT(S_WS, '\n', makeState()); // non-accepting state - makeRegexT(S_WS, SPACE, S_WS); + makeT(Ws, '\n', makeState()); // non-accepting state + makeRegexT(Ws, SPACE, Ws); // Generates states for top-level domains // Note that this is most accurate when tlds are in alphabetical order for (let i = 0; i < tlds.length; i++) { - makeChainT(S_START, tlds[i], makeNearDomainState(tk.TLD), makeDomainState); + makeChainT(Start, tlds[i], makeNearWordState(tk.TLD), makeWordState); + } + for (let i = 0; i < utlds.length; i++) { + makeChainT(Start, utlds[i], makeNearUWordState(tk.UTLD), makeUWordState); } // Collect the states generated by different protocls - const S_PROTOCOL_FILE = makeDomainState(); - const S_PROTOCOL_FTP = makeDomainState(); - const S_PROTOCOL_HTTP = makeDomainState(); - const S_MAILTO = makeDomainState(); - makeChainT(S_START, 'file', S_PROTOCOL_FILE, makeDomainState); - makeChainT(S_START, 'ftp', S_PROTOCOL_FTP, makeDomainState); - makeChainT(S_START, 'http', S_PROTOCOL_HTTP, makeDomainState); - makeChainT(S_START, 'mailto', S_MAILTO, makeDomainState); - - // Protocol states - const S_PROTOCOL_SECURE = makeDomainState(); - const S_FULL_PROTOCOL = makeAcceptingState(tk.PROTOCOL); // Full protocol ends with COLON - const S_FULL_MAILTO = makeAcceptingState(tk.MAILTO); // Mailto ends with COLON - - // Secure protocols (end with 's') - makeT(S_PROTOCOL_FTP, 's', S_PROTOCOL_SECURE); - makeT(S_PROTOCOL_FTP, ':', 
S_FULL_PROTOCOL); - makeT(S_PROTOCOL_HTTP, 's', S_PROTOCOL_SECURE); - makeT(S_PROTOCOL_HTTP, ':', S_FULL_PROTOCOL); - - // Become protocol tokens after a COLON - makeT(S_PROTOCOL_FILE, ':', S_FULL_PROTOCOL); - makeT(S_PROTOCOL_SECURE, ':', S_FULL_PROTOCOL); - makeT(S_MAILTO, ':', S_FULL_MAILTO); - - // Register custom protocols - const S_CUSTOM_PROTOCOL = makeDomainState(); - for (let i = 0; i < customProtocols.length; i++) { - makeChainT(S_START, customProtocols[i], S_CUSTOM_PROTOCOL, makeDomainState); + const DefaultScheme = makeNearWordState(tk.SCHEME, 'DefaultScheme'); + const DefaultSlashScheme = makeNearWordState(tk.SLASH_SCHEME, 'DefaultSlashScheme'); + makeChainT(Start, 'file', DefaultScheme, makeWordState); + makeChainT(Start, 'mailto', DefaultScheme, makeWordState); + makeChainT(Start, 'ftp', DefaultSlashScheme, makeWordState); + makeChainT(Start, 'http', DefaultSlashScheme, makeWordState); + + // Secure (https, ftps) protocols (end with 's') + makeT(DefaultSlashScheme, 's', DefaultSlashScheme); + + // Register custom schemes + const CustomScheme = makeNearWordState(tk.SCHEME, 'CustomScheme'); + const CustomSlashScheme = makeNearWordState(tk.SLASH_SCHEME, 'CustomSlashScheme'); + const CustomCompoundScheme = makeAcceptingState(tk.SCHEME, 'CustomCompoundScheme'); + const CustomCompoundSlashScheme = makeAcceptingState(tk.SLASH_SCHEME, 'CustomCompoundSlashScheme'); + customSchemes = customSchemes.sort((a, b) => a[0] > b[0] ? 1 : -1); + for (let i = 0; i < customSchemes.length; i++) { + const schemeParts = customSchemes[i][0].split('-'); + const schemeState = schemeParts.length === 1 + ? (customSchemes[i][1] ? CustomScheme : CustomSlashScheme) + : (customSchemes[i][1] ? CustomCompoundScheme : CustomCompoundSlashScheme); + + let state = Start; + for (let j = 0; j < schemeParts.length; j++) { + let defaultStateFactory = j === 0 ? makeWordState : makeState; + let endState = j === schemeParts.length - 1 ? 
schemeState : defaultStateFactory(); + state = makeChainT(state, schemeParts[j], endState, defaultStateFactory); + if (schemeParts.length > 1 && j < schemeParts.length - 1) { + state = makeT(state, '-', makeState()); + } + } } - makeT(S_CUSTOM_PROTOCOL, ':', S_FULL_PROTOCOL); - // Localhost - makeChainT(S_START, 'localhost', makeNearDomainState(tk.LOCALHOST), makeDomainState); + // Localhost token + makeChainT(Start, 'localhost', makeNearWordState(tk.LOCALHOST), makeWordState); // Everything else - // DOMAINs make more DOMAINs // Number and character transitions - makeRegexT(S_START, DIGIT, S_NUM); - makeRegexT(S_START, LETTER, S_DOMAIN); - makeRegexT(S_START, EMOJI, S_DOMAIN); - makeRegexT(S_START, EMOJI_VARIATION, S_DOMAIN); - makeRegexT(S_NUM, DIGIT, S_NUM); - makeRegexT(S_NUM, LETTER, S_DOMAIN); // number becomes DOMAIN - makeRegexT(S_NUM, EMOJI, S_DOMAIN); // number becomes DOMAIN - makeRegexT(S_NUM, EMOJI_VARIATION, S_DOMAIN); // number becomes DOMAIN - makeT(S_NUM, '-', S_DOMAIN_HYPHEN); - - // Default domain transitions - makeT(S_DOMAIN, '-', S_DOMAIN_HYPHEN); - makeT(S_DOMAIN_HYPHEN, '-', S_DOMAIN_HYPHEN); - makeRegexT(S_DOMAIN, DIGIT, S_DOMAIN); - makeRegexT(S_DOMAIN, LETTER, S_DOMAIN); - makeRegexT(S_DOMAIN, EMOJI, S_DOMAIN); - makeRegexT(S_DOMAIN, EMOJI_VARIATION, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, DIGIT, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, LETTER, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, EMOJI, S_DOMAIN); - makeRegexT(S_DOMAIN_HYPHEN, EMOJI_VARIATION, S_DOMAIN); + makeRegexT(Start, DIGIT, Num); + makeRegexT(Start, ASCII_LETTER, Word); + makeRegexT(Start, LETTER, UWord); + makeRegexT(Start, EMOJI, Emoji); + makeRegexT(Start, EMOJI_VARIATION, Emoji); // This one is sketchy + makeRegexT(Num, DIGIT, Num); + makeRegexT(Word, ASCII_LETTER, Word); + makeRegexT(UWord, ASCII_LETTER, NonAccepting); + makeRegexT(UWord, LETTER, UWord); + makeRegexT(Emoji, EMOJI, Emoji); + makeRegexT(Emoji, EMOJI_VARIATION, Emoji); + + // Account for zero-width joiner 
for chaining multiple emojis + // Not sure if these are actu + const EmojiJoiner = makeState(); + makeT(Emoji, '\u200d', EmojiJoiner); + makeRegexT(EmojiJoiner, EMOJI, Emoji); + makeRegexT(EmojiJoiner, EMOJI_VARIATION, Emoji); // Set default transition for start state (some symbol) - S_START.jd = makeAcceptingState(tk.SYM); - return S_START; + Start.jd = makeAcceptingState(tk.SYM, 'Sym'); + return Start; } /** @@ -188,10 +210,9 @@ export function init(customProtocols = []) { */ export function run(start, str) { // State machine is not case sensitive, so input is tokenized in lowercased - // form (still returns the regular case though) Uses selective `toLowerCase` - // is used because lowercasing the entire string causes the length and - // character position to vary in some non-English strings with V8-based - // runtimes. + // form (still returns regular case). Uses selective `toLowerCase` because + // lowercasing the entire string causes the length and character position to + // vary in some non-English strings with V8-based runtimes. const iterable = stringToArray(str.replace(/[A-Z]/g, (c) => c.toLowerCase())); const charCount = iterable.length; // <= len if there are emojis, etc const tokens = []; // return value diff --git a/packages/linkifyjs/src/core/tlds.js b/packages/linkifyjs/src/core/tlds.js index 404fa07a..6e588e99 100644 --- a/packages/linkifyjs/src/core/tlds.js +++ b/packages/linkifyjs/src/core/tlds.js @@ -2,10 +2,13 @@ // be as commonly used without the http prefix anyway and linkify will already // force-encode those. 
+// NOTE: vermögensberater vermögensberatung are special cases because they're +// the only ones in this list that contain non-ASCII characters + // To be updated with the values in this list // http://data.iana.org/TLD/tlds-alpha-by-domain.txt // Version 2021022800, Last Updated Sun Feb 28 07:07:01 2021 UTC -export default 'aaa \ +export const tlds = 'aaa \ aarp \ abarth \ abb \ @@ -1264,6 +1267,8 @@ ve \ vegas \ ventures \ verisign \ +vermögensberater \ +vermögensberatung \ versicherung \ vet \ vg \ @@ -1353,10 +1358,10 @@ zip \ zm \ zone \ zuerich \ -zw \ -vermögensberater-ctb \ -vermögensberatung-pwb \ -ελ \ +zw'.split(' '); + +// Internationalized domain names containing non-ASCII +export const utlds = 'ελ \ ευ \ бг \ бел \ diff --git a/packages/linkifyjs/src/core/tokens/multi.js b/packages/linkifyjs/src/core/tokens/multi.js index e6663d11..df1ebc0a 100644 --- a/packages/linkifyjs/src/core/tokens/multi.js +++ b/packages/linkifyjs/src/core/tokens/multi.js @@ -1,4 +1,4 @@ -import { PROTOCOL, SLASH } from './text'; +import { scheme, COLON } from './text'; import { defaults } from '../options'; /****************************************************************************** @@ -124,13 +124,6 @@ export function createTokenClass(type, props) { return Token; } -/** - Represents an arbitrarily mailto email address with the prefix included - @class MailtoEmail - @extends MultiToken -*/ -export const MailtoEmail = createTokenClass('email', { isLink: true }); - /** Represents a list of tokens making up a valid email address @class Email @@ -175,42 +168,12 @@ export const Url = createTokenClass('url', { @return {string} */ toHref(protocol = defaults.defaultProtocol) { - const tokens = this.tk; - let hasProtocol = false; - let hasSlashSlash = false; - let result = []; - let i = 0; - - // Make the first part of the domain lowercase - // Lowercase protocol - while (tokens[i].t === PROTOCOL) { - hasProtocol = true; - result.push(tokens[i].v); - i++; - } - - // Skip slash-slash 
- while (tokens[i].t === SLASH) { - hasSlashSlash = true; - result.push(tokens[i].v); - i++; - } - - // Continue pushing characters - for (; i < tokens.length; i++) { - result.push(tokens[i].v); - } - - result = result.join(''); - - if (!(hasProtocol || hasSlashSlash)) { - result = `${protocol}://${result}`; - } - - return result; + // Check if already has a prefix scheme + return this.hasProtocol() ? this.v : `${protocol}://${this.v}`; }, hasProtocol() { - return this.tk[0].t === PROTOCOL; + const tokens = this.tk; + return tokens.length >= 2 && scheme.indexOf(tokens[0].t) >= 0 && tokens[1].t === COLON; } }); diff --git a/packages/linkifyjs/src/core/tokens/text.js b/packages/linkifyjs/src/core/tokens/text.js index 85b6f0ef..a6aa958b 100644 --- a/packages/linkifyjs/src/core/tokens/text.js +++ b/packages/linkifyjs/src/core/tokens/text.js @@ -4,25 +4,43 @@ ******************************************************************************/ // A valid web domain token -export const DOMAIN = 'DOMAIN'; -export const LOCALHOST = 'LOCALHOST'; // special case of domain +export const WORD = 'WORD'; // only contains a-z +export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN -// Valid top-level domain (see tlds.js) +// Special case of word +export const LOCALHOST = 'LOCALHOST'; + +// Valid top-level domain, special case of WORD (see tlds.js) export const TLD = 'TLD'; -// Any sequence of digits 0-9 -export const NUM = 'NUM'; +// Valid IDN TLD, special case of UWORD (see tlds.js) +export const UTLD = 'UTLD'; + +// The scheme portion of a web URI protocol. Supported types include: `mailto`, +// `file`, and user-defined custom protocols. Limited to schemes that contain +// only letters +export const SCHEME = 'SCHEME'; + +// Similar to SCHEME, except makes distinction for schemes that must always be +// followed by `://`, not just `:`. 
Supported types include `http`, `https`, +// `ftp`, `ftps` +export const SLASH_SCHEME = 'SLASH_SCHEME'; + +// Similar to SCHEME, except contains - +export const COMPOUND_SCHEME = 'COMPOUND_SCHEME'; -// A web URL protocol. Supported types include -// - `http:` -// - `https:` -// - `ftp:` -// - `ftps:` -// - user-defined custom protocols -export const PROTOCOL = 'PROTOCOL'; +// Similar to SLASH_SCHEME, except contains - +export const COMPOUND_SLASH_SCHEME = 'COMPOUND_SLASH_SCHEME'; -// Start of the email URI protocol -export const MAILTO = 'MAILTO'; // mailto: +// TODO: Move this to keyword plugin +// Arbirary words that can keyword links +// export const KEYWORD = 'KEYWORD'; // simple [0-9a-z] +// export const UKEYWORD = 'UKEYWORD'; // containing [0-9\{Letter}] +// export const COMPOUND_KEYWORD = 'COMPOUND_KEYWORD'; // similar to KEYWORD but can have hyphens +// export const COMPOUND_UKEYWORD = 'COMPOUND_UKEYWORD'; // similar to UKEYWORD but can have hyphens + +// Any sequence of digits 0-9 +export const NUM = 'NUM'; // Any number of consecutive whitespace characters that are not newline export const WS = 'WS'; @@ -67,5 +85,19 @@ export const SLASH = 'SLASH'; // / export const TILDE = 'TILDE'; // ~ export const UNDERSCORE = 'UNDERSCORE'; // _ +// Emoji symbol +export const EMOJIS = 'EMOJIS'; + // Default token - anything that is not one of the above export const SYM = 'SYM'; + +// Token collections for grouping similar jumps in the parser +export const numeric = [NUM]; +export const ascii = [WORD, LOCALHOST, TLD, SCHEME, SLASH_SCHEME]; +export const asciinumeric = ascii.concat(NUM); +export const words = ascii.concat(UWORD, UTLD); +export const alphanumeric = words.concat(NUM); +export const domain = words.concat(COMPOUND_SCHEME, COMPOUND_SLASH_SCHEME, NUM, EMOJIS); +export const scheme = [SCHEME, SLASH_SCHEME, COMPOUND_SCHEME, COMPOUND_SLASH_SCHEME]; + +export const collections = { ascii, asciinumeric, words, alphanumeric, domain, scheme }; diff --git 
a/packages/linkifyjs/src/linkify.js b/packages/linkifyjs/src/linkify.js index 874f4c10..b52e6a64 100644 --- a/packages/linkifyjs/src/linkify.js +++ b/packages/linkifyjs/src/linkify.js @@ -8,7 +8,7 @@ const INIT = { scanner: null, parser: null, pluginQueue: [], - customProtocols: [], + customSchemes: [], initialized: false, }; @@ -21,7 +21,7 @@ export function reset() { INIT.scanner = null; INIT.parser = null; INIT.pluginQueue = []; - INIT.customProtocols = []; + INIT.customSchemes = []; INIT.initialized = false; } @@ -45,18 +45,20 @@ export function registerPlugin(name, plugin) { } /** - * Detect URLs with the following additional protocol. Anything following - * "protocol:" will be considered a link. + * Detect URLs with the following additional protocol. Anything with format + * "protocol://..." will be considered a link. If `optionalSlashSlash` is set to + * `true`, anything with format "protocol:..." will be considered a link. * @param {string} protocol + * @param {boolean} [optionalSlashSlash] if set to true, */ -export function registerCustomProtocol(protocol) { +export function registerCustomProtocol(protocol, optionalSlashSlash = false) { if (INIT.initialized) { - warn(`linkifyjs: already initialized - will not register custom protocol "${protocol}" until you manually call linkify.init(). To avoid this warning, please register all custom protocols before invoking linkify the first time.`); + warn(`linkifyjs: already initialized - will not register custom protocol "${protocol}" until you manually call linkify.init(). To avoid this warning, please register all custom schemes before invoking linkify the first time.`); } - if (!/^[a-z-]+$/.test(protocol)) { - throw Error('linkifyjs: protocols containing characters other than a-z or - (hyphen) are not supported'); + if (!/^[a-z]+(-[a-z]+)*$/.test(protocol)) { + throw Error('linkifyjs: incorrect protocol format.\n 1. Must only contain lowercase ASCII letters or -\n 2. Cannot start or end with -\n 3. 
- cannot repeat'); } - INIT.customProtocols.push(protocol); + INIT.customSchemes.push([protocol, optionalSlashSlash]); } /** @@ -65,7 +67,7 @@ export function registerCustomProtocol(protocol) { */ export function init() { // Initialize state machines - INIT.scanner = { start: scanner.init(INIT.customProtocols), tokens: scanner.tokens }; + INIT.scanner = { start: scanner.init(INIT.customSchemes), tokens: scanner.tokens }; INIT.parser = { start: parser.init(), tokens: parser.tokens }; const utils = { createTokenClass: parser.tokens.createTokenClass }; diff --git a/packages/linkifyjs/src/plugins/hashtag.js b/packages/linkifyjs/src/plugins/hashtag.js index ed3ee386..e2b47d50 100644 --- a/packages/linkifyjs/src/plugins/hashtag.js +++ b/packages/linkifyjs/src/plugins/hashtag.js @@ -5,37 +5,31 @@ import { registerPlugin } from 'linkifyjs'; export const hashtag = ({ scanner, parser, utils }) => { // Various tokens that may compose a hashtag - const { POUND, DOMAIN, TLD, LOCALHOST, UNDERSCORE } = scanner.tokens; + const { POUND, NUM, UNDERSCORE, words } = scanner.tokens; // The start state - const START_STATE = parser.start; + const Start = parser.start; // Create a new token that class that the parser emits when it finds a hashtag - const Hashtag = utils.createTokenClass('hashtag', { isLink: true }); + const HashtagToken = utils.createTokenClass('hashtag', { isLink: true }); // Take or create a transition from start to the '#' sign (non-accepting) - const HASH_STATE = START_STATE.tt(POUND); + const Hash = Start.tt(POUND); // Take transition from '#' to any text token to yield valid hashtag state - const HASHTAG_STATE = HASH_STATE.tt(DOMAIN, Hashtag); - - // Now that we have the hashtag state, no need to create new states - HASH_STATE.tt(TLD, HASHTAG_STATE); - HASH_STATE.tt(LOCALHOST, HASHTAG_STATE); + const Hashtag = Hash.tt(words, HashtagToken); + Hashtag.tt(NUM, Hashtag); + Hashtag.tt(UNDERSCORE, Hashtag); // Trailing underscore is okay + Hashtag.tt(words, Hashtag); // 
Account for leading underscore (non-accepting unless followed by domain) - const HASH_UNDERSCORE_STATE = HASH_STATE.tt(UNDERSCORE); - HASH_UNDERSCORE_STATE.tt(UNDERSCORE, HASH_UNDERSCORE_STATE); - HASH_UNDERSCORE_STATE.tt(DOMAIN, HASHTAG_STATE); - HASH_UNDERSCORE_STATE.tt(TLD, HASHTAG_STATE); - HASH_UNDERSCORE_STATE.tt(LOCALHOST, HASHTAG_STATE); - - // Continue the transitions - HASHTAG_STATE.tt(UNDERSCORE, HASHTAG_STATE); - HASHTAG_STATE.tt(DOMAIN, HASHTAG_STATE); - HASHTAG_STATE.tt(TLD, HASHTAG_STATE); - HASHTAG_STATE.tt(LOCALHOST, HASHTAG_STATE); - // Trailing underscore is okay + const HashPrefix = Hash.tt(NUM); + + Hash.tt(UNDERSCORE, HashPrefix); + HashPrefix.tt(NUM, HashPrefix); + HashPrefix.tt(UNDERSCORE, HashPrefix); + HashPrefix.tt(words, Hashtag); + }; diff --git a/packages/linkifyjs/src/plugins/mention.js b/packages/linkifyjs/src/plugins/mention.js index 9ab82c45..75738d20 100644 --- a/packages/linkifyjs/src/plugins/mention.js +++ b/packages/linkifyjs/src/plugins/mention.js @@ -4,10 +4,10 @@ import { registerPlugin } from 'linkifyjs'; export const mention = ({ scanner, parser, utils }) => { - const { DOMAIN, LOCALHOST, TLD, NUM, SLASH, UNDERSCORE, DOT, AT } = scanner.tokens; - const START_STATE = parser.start; + const { numeric, domain, HYPHEN, SLASH, UNDERSCORE, AT } = scanner.tokens; + const Start = parser.start; - const Mention = utils.createTokenClass('mention', { + const MentionToken = utils.createTokenClass('mention', { isLink: true, toHref() { return '/' + this.toString().substr(1); @@ -15,53 +15,27 @@ export const mention = ({ scanner, parser, utils }) => { }); // @ - const AT_STATE = START_STATE.tt(AT); // @ - - // @_, - const AT_SYMS_STATE = AT_STATE.tt(UNDERSCORE); - - // @_* - AT_SYMS_STATE.tt(UNDERSCORE, AT_SYMS_STATE); - AT_SYMS_STATE.tt(DOT, AT_SYMS_STATE); + const At = Start.tt(AT); // @ // Valid mention (not made up entirely of symbols) - const MENTION_STATE = AT_STATE.tt(DOMAIN, Mention); - AT_STATE.tt(TLD, MENTION_STATE); - 
AT_STATE.tt(LOCALHOST, MENTION_STATE); - AT_STATE.tt(NUM, MENTION_STATE); - - // @[_.]* + valid mention - AT_SYMS_STATE.tt(DOMAIN, MENTION_STATE); - AT_SYMS_STATE.tt(LOCALHOST, MENTION_STATE); - AT_SYMS_STATE.tt(TLD, MENTION_STATE); - AT_SYMS_STATE.tt(NUM, MENTION_STATE); + const Mention = At.tt(domain, MentionToken); + At.tt(numeric, Mention); + At.tt(UNDERSCORE, Mention); // More valid mentions - MENTION_STATE.tt(DOMAIN, MENTION_STATE); - MENTION_STATE.tt(LOCALHOST, MENTION_STATE); - MENTION_STATE.tt(TLD, MENTION_STATE); - MENTION_STATE.tt(NUM, MENTION_STATE); - MENTION_STATE.tt(UNDERSCORE, MENTION_STATE); + Mention.tt(domain, Mention); + Mention.tt(numeric, Mention); + Mention.tt(UNDERSCORE, Mention); + Mention.tt(HYPHEN, Mention); // Mention with a divider - const MENTION_DIVIDER_STATE = MENTION_STATE.tt(SLASH); - MENTION_STATE.tt(SLASH, MENTION_DIVIDER_STATE); - MENTION_STATE.tt(DOT, MENTION_DIVIDER_STATE); - MENTION_STATE.tt(AT, MENTION_DIVIDER_STATE); - - // Mention _ trailing stash plus syms - const MENTION_DIVIDER_SYMS_STATE = MENTION_DIVIDER_STATE.tt(UNDERSCORE); - MENTION_DIVIDER_SYMS_STATE.tt(UNDERSCORE, MENTION_DIVIDER_SYMS_STATE); + const MentionDivider = Mention.tt(SLASH); // Once we get a word token, mentions can start up again - MENTION_DIVIDER_STATE.tt(DOMAIN, MENTION_STATE); - MENTION_DIVIDER_STATE.tt(LOCALHOST, MENTION_STATE); - MENTION_DIVIDER_STATE.tt(TLD, MENTION_STATE); - MENTION_DIVIDER_STATE.tt(NUM, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(DOMAIN, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(LOCALHOST, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(TLD, MENTION_STATE); - MENTION_DIVIDER_SYMS_STATE.tt(NUM, MENTION_STATE); + MentionDivider.tt(domain, Mention); + MentionDivider.tt(numeric, Mention); + MentionDivider.tt(UNDERSCORE, Mention); + MentionDivider.tt(HYPHEN, Mention); }; registerPlugin('mention', mention); diff --git a/packages/linkifyjs/src/plugins/ticket.js b/packages/linkifyjs/src/plugins/ticket.js index 
df3864f9..b6301fe3 100644 --- a/packages/linkifyjs/src/plugins/ticket.js +++ b/packages/linkifyjs/src/plugins/ticket.js @@ -6,12 +6,12 @@ import { registerPlugin } from 'linkifyjs'; export const ticket = ({ scanner, parser, utils }) => { // TODO: Add cross-repo style tickets? e.g., Hypercontext/linkifyjs#42 // Is that even feasible? - const { POUND, NUM } = scanner.tokens; - const START_STATE = parser.start; - const Ticket = utils.createTokenClass('ticket', { isLink: true }); + const { POUND, numeric } = scanner.tokens; + const Start = parser.start; + const TicketToken = utils.createTokenClass('ticket', { isLink: true }); - const HASH_STATE = START_STATE.tt(POUND); - HASH_STATE.tt(NUM, Ticket); + const Hash = Start.tt(POUND); + Hash.tt(numeric, TicketToken); }; registerPlugin('ticket', ticket); diff --git a/test/benchmarks.js b/test/benchmarks.js index 13470b1b..7de244a4 100644 --- a/test/benchmarks.js +++ b/test/benchmarks.js @@ -3,8 +3,8 @@ var ITERATIONS = 500; function bench1(linkify) { // eslint-disable-next-line no-debugger debugger; // prevents V8 optimization - delete require.cache[require.resolve('../lib/linkify')]; - linkify = require('../lib/linkify'); + delete require.cache[require.resolve('linkifyjs')]; + linkify = require('linkifyjs'); // linkify.init(); linkify.find(''); // delete require.cache[require.resolve('moment')]; @@ -252,7 +252,7 @@ mailto:bar`); [bench1, bench2].forEach((bench) => { debugger; // var usageInitial = process.memoryUsage(); - var linkify = require('../lib/linkify'); + var linkify = require('linkifyjs'); linkify.init(); // var usageLinkify = process.memoryUsage(); diff --git a/test/spec/core/parser.test.js b/test/spec/core/parser.test.js index 2bb5ec26..183a8f56 100644 --- a/test/spec/core/parser.test.js +++ b/test/spec/core/parser.test.js @@ -1,7 +1,7 @@ const { expect } = require('chai'); const scanner = require('linkifyjs/src/core/scanner'); const parser = require('linkifyjs/src/core/parser'); -const { Text, Url, Email, 
MailtoEmail } = require('linkifyjs/src/core/tokens/multi'); +const { Text, Url, Email } = require('linkifyjs/src/core/tokens/multi'); /** [0] - Original text to parse (should tokenize first) @@ -106,15 +106,15 @@ const tests = [ ['Emails cannot have two dots, e.g.: nick..', 'f@yahoo.ca'] ], [ 'The `mailto:` part should be included in mailto:this.is.a.test@yandex.ru', - [Text, MailtoEmail], + [Text, Url], ['The `mailto:` part should be included in ', 'mailto:this.is.a.test@yandex.ru'] ], [ 'mailto:echalk-dev@logicify.com?Subject=Hello%20again is another test', - [MailtoEmail, Text], + [Url, Text], ['mailto:echalk-dev@logicify.com?Subject=Hello%20again', ' is another test'] ], [ 'Mailto is greedy mailto:localhost?subject=Hello%20World.', - [Text, MailtoEmail, Text], + [Text, Url, Text], ['Mailto is greedy ', 'mailto:localhost?subject=Hello%20World', '.'] ], [ 'Emails like: test@42.domain.com and test@42.abc.11.domain.com should be matched in its entirety.', @@ -196,6 +196,22 @@ const tests = [ 'o\'malley@example.com.au', // Email with apostrophe [Email], ['o\'malley@example.com.au'] + ], [ + 'foohttp://example.com bar', + [Text, Url, Text], + ['foohttp://', 'example.com', ' bar'], + ], [ + 'テストhttp://example.comテスト', + [Text, Url], + ['テスト', 'http://example.comテスト'], + ], [ + 'file:/etc/motd', + [Url], + ['file:/etc/motd'] + ], [ + 'file:///etc/motd', + [Url], + ['file:///etc/motd'] ] ]; @@ -216,6 +232,7 @@ describe('linkifyjs/core/parser#run()', () => { }); } + // eslint-disable-next-line mocha/no-setup-in-describe tests.map(makeTest, this); it('Correctly sets start and end indexes', () => { diff --git a/test/spec/core/scanner.test.js b/test/spec/core/scanner.test.js index dc84d451..c0130a2b 100644 --- a/test/spec/core/scanner.test.js +++ b/test/spec/core/scanner.test.js @@ -29,58 +29,86 @@ const tests = [ ['&?<>(', [t.AMPERSAND, t.QUERY, t.OPENANGLEBRACKET, t.CLOSEANGLEBRACKET, t.OPENPAREN], ['&', '?', '<', '>', '(']], ['([{}])', [t.OPENPAREN, t.OPENBRACKET, 
t.OPENBRACE, t.CLOSEBRACE, t.CLOSEBRACKET, t.CLOSEPAREN], ['(', '[', '{', '}', ']', ')']], ['!,;\'', [t.EXCLAMATION, t.COMMA, t.SEMI, t.APOSTROPHE], ['!', ',', ';', '\'']], - ['hello', [t.DOMAIN], ['hello']], - ['Hello123', [t.DOMAIN], ['Hello123']], - ['hello123world', [t.DOMAIN], ['hello123world']], + ['hello', [t.WORD], ['hello']], + ['Hello123', [t.WORD, t.NUM], ['Hello', '123']], + ['hello123world', [t.WORD, t.NUM, t.TLD], ['hello', '123', 'world']], ['0123', [t.NUM], ['0123']], - ['123abc', [t.DOMAIN], ['123abc']], - ['http', [t.DOMAIN], ['http']], - ['http:', [t.PROTOCOL], ['http:']], - ['https:', [t.PROTOCOL], ['https:']], - ['files:', [t.DOMAIN, t.COLON], ['files', ':']], - ['file//', [t.DOMAIN, t.SLASH, t.SLASH], ['file', '/', '/']], - ['ftp://', [t.PROTOCOL, t.SLASH, t.SLASH], ['ftp:', '/', '/']], - ['mailto', [t.DOMAIN], ['mailto']], - ['mailto:', [t.MAILTO], ['mailto:']], - ['c', [t.DOMAIN], ['c']], + ['123abc', [t.NUM, t.TLD], ['123', 'abc']], + ['http', [t.SLASH_SCHEME], ['http']], + ['http:', [t.SLASH_SCHEME, t.COLON], ['http', ':']], + ['https:', [t.SLASH_SCHEME, t.COLON], ['https', ':']], + ['files:', [t.WORD, t.COLON], ['files', ':']], + ['file//', [t.SCHEME, t.SLASH, t.SLASH], ['file', '/', '/']], + ['ftp://', [t.SLASH_SCHEME, t.COLON, t.SLASH, t.SLASH], ['ftp', ':', '/', '/']], + ['mailto', [t.SCHEME], ['mailto']], + ['mailto:', [t.SCHEME, t.COLON], ['mailto', ':']], + ['c', [t.WORD], ['c']], ['co', [t.TLD], ['co']], ['com', [t.TLD], ['com']], - ['comm', [t.DOMAIN], ['comm']], - ['abc 123 DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.DOMAIN], ['abc', ' ', '123', ' ', 'DoReMi']], - ['abc 123 \n DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.NL, t.WS, t.DOMAIN], ['abc', ' ', '123', ' ', '\n', ' ', 'DoReMi']], - ['local', [t.DOMAIN], ['local']], + ['comm', [t.WORD], ['comm']], + ['vermögensberater السعودية москва', [t.TLD, t.WS, t.UTLD, t.WS, t.UTLD], ['vermögensberater', ' ', 'السعودية', ' ', 'москва']], + ['abc 123 DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.WORD], 
['abc', ' ', '123', ' ', 'DoReMi']], + ['abc 123 \n DoReMi', [t.TLD, t.WS, t.NUM, t.WS, t.NL, t.WS, t.WORD], ['abc', ' ', '123', ' ', '\n', ' ', 'DoReMi']], + ['local', [t.WORD], ['local']], ['localhost', [t.LOCALHOST], ['localhost']], - ['localhosts', [t.DOMAIN], ['localhosts']], - ['500px', [t.DOMAIN], ['500px']], - ['500-px', [t.DOMAIN], ['500-px']], - ['-500px', [t.HYPHEN, t.DOMAIN], ['-', '500px']], - ['500px-', [t.DOMAIN, t.HYPHEN], ['500px', '-']], - ['123-456', [t.DOMAIN], ['123-456']], + ['localhosts', [t.WORD], ['localhosts']], + ['500px', [t.NUM, t.WORD], ['500', 'px']], + ['500-px', [t.NUM, t.HYPHEN, t.WORD], ['500', '-', 'px']], + ['-500px', [t.HYPHEN, t.NUM, t.WORD], ['-', '500', 'px']], + ['500px-', [t.NUM, t.WORD, t.HYPHEN], ['500', 'px', '-']], + ['123-456', [t.NUM, t.HYPHEN, t.NUM], ['123', '-', '456']], ['foo\u00a0bar', [t.TLD, t.WS, t.TLD], ['foo', '\u00a0', 'bar']], // nbsp - ['çïrâ.ca', [t.DOMAIN, t.DOT, t.TLD], ['çïrâ', '.', 'ca']], - ['www.🍕💩.ws', [t.DOMAIN, t.DOT, t.DOMAIN, t.DOT, t.TLD], ['www', '.', '🍕💩', '.', 'ws']], + ['çïrâ.ca', [t.UWORD, t.WORD, t.UWORD, t.DOT, t.TLD], ['çï', 'r', 'â', '.', 'ca']], + ['❤️💚', [t.EMOJIS], ['❤️💚']], + ['👊🏿🧑🏼‍🔬🌚', [t.EMOJIS], ['👊🏿🧑🏼‍🔬🌚']], // contains zero-width joiner \u200d + ['www.🍕💩.ws', [t.WORD, t.DOT, t.EMOJIS, t.DOT, t.TLD], ['www', '.', '🍕💩', '.', 'ws']], [ 'za̡͊͠͝lgό.gay', // May support diacritics in the future if someone complains - [t.TLD, t.SYM, t.SYM, t.SYM, t.SYM, t.DOMAIN, t.DOT, t.TLD], - ['za', '͠', '̡', '͊', '͝', 'lgό','.','gay'] + [t.TLD, t.SYM, t.SYM, t.SYM, t.SYM, t.WORD, t.UWORD, t.DOT, t.TLD], + ['za', '͠', '̡', '͊', '͝', 'lg', 'ό','.','gay'] ], [ 'Direniş İzleme Grubu\'nun', - [t.DOMAIN, t.WS, t.DOMAIN, t.WS, t.DOMAIN, t.APOSTROPHE, t.DOMAIN], - ['Direniş', ' ', 'İzleme', ' ', 'Grubu', '\'', 'nun'] + [t.WORD, t.UWORD, t.WS, t.UWORD, t.WORD, t.WS, t.WORD, t.APOSTROPHE, t.WORD], + ['Direni', 'ş', ' ', 'İ', 'zleme', ' ', 'Grubu', '\'', 'nun'] ], [ 'example.com   テスト', // spaces are 
ideographic space - [t.DOMAIN, t.DOT, t.TLD, t.WS, t.DOMAIN], + [t.WORD, t.DOT, t.TLD, t.WS, t.UWORD], ['example', '.', 'com', '   ', 'テスト'] ], [ '#АБВ_бв #한글 #سلام', - [t.POUND, t.DOMAIN, t.UNDERSCORE, t.DOMAIN, t.WS, t.POUND, t.DOMAIN, t.WS, t.POUND, t.DOMAIN], + [t.POUND, t.UWORD, t.UNDERSCORE, t.UWORD, t.WS, t.POUND, t.UWORD, t.WS, t.POUND, t.UWORD], ['#', 'АБВ', '_', 'бв', ' ', '#', '한글', ' ', '#', 'سلام'] + ], + [ + 'テストexample.comテスト', + [t.UWORD, t.WORD, t.DOT, t.TLD, t.UWORD], + ['テスト', 'example', '.', 'com', 'テスト'] + ], + [ + 'テストhttp://example.comテスト', + [t.UWORD, t.SLASH_SCHEME, t.COLON, t.SLASH, t.SLASH, t.WORD, t.DOT, t.TLD, t.UWORD], + ['テスト', 'http', ':', '/', '/', 'example', '.', 'com', 'テスト'] ] ]; +const customSchemeTests = [ + ['stea', [t.WORD], ['stea']], + ['steam', [t.SCHEME], ['steam']], + ['steams', [t.WORD], ['steams']], + ['view', [t.WORD], ['view']], + ['view-', [t.WORD, t.HYPHEN], ['view', '-']], + ['view-s', [t.WORD, t.HYPHEN, t.WORD], ['view', '-', 's']], + ['view-sour', [t.WORD, t.HYPHEN, t.WORD], ['view', '-', 'sour']], + ['view-source', [t.SLASH_SCHEME], ['view-source']], + ['view-sources', [t.SLASH_SCHEME, t.WORD], ['view-source', 's']], // This is an unfortunate consequence :( + ['fb', [t.SLASH_SCHEME], ['fb']], + ['twitter sux', [t.SLASH_SCHEME, t.WS, t.WORD], ['twitter', ' ', 'sux']], + ['ms-settings', [t.SCHEME], ['ms-settings']], +]; + describe('linkifyjs/core/scanner#run()', () => { let start; @@ -103,7 +131,7 @@ describe('linkifyjs/core/scanner#run()', () => { it('Correctly sets start and end indexes', () => { expect(scanner.run(start, 'Hello, World!')).to.eql([ - { t: t.DOMAIN, v: 'Hello', s: 0, e: 5 }, + { t: t.WORD, v: 'Hello', s: 0, e: 5 }, { t: t.COMMA, v: ',', s: 5, e: 6 }, { t: t.WS, v: ' ', s: 6, e: 7 }, { t: t.TLD, v: 'World', s: 7, e: 12 }, @@ -112,22 +140,35 @@ describe('linkifyjs/core/scanner#run()', () => { }); describe('Custom protocols', () => { - before(() => { start = scanner.init(['twitter', 'fb', 
'steam']); }); + + before(() => { + start = scanner.init([ + ['twitter', false], + ['fb', false], + ['steam', true], + ['view-source', false], + ['ms-settings', true] + ]); + }); + + // eslint-disable-next-line mocha/no-setup-in-describe + customSchemeTests.map(makeTest, this); it('Correctly tokenizes a full custom protocols', () => { expect(scanner.run(start, 'steam://hello')).to.eql([ - { t: t.PROTOCOL, v: 'steam:', s: 0, e: 6 }, + { t: t.SCHEME, v: 'steam', s: 0, e: 5 }, + { t: t.COLON, v: ':', s: 5, e: 6 }, { t: t.SLASH, v: '/', s: 6, e: 7 }, { t: t.SLASH, v: '/', s: 7, e: 8 }, - { t: t.DOMAIN, v: 'hello', s: 8, e: 13 } + { t: t.WORD, v: 'hello', s: 8, e: 13 } ]); }); - it('Classifies partial custom protocols as domains', () => { + it('Classifies partial schemes', () => { expect(scanner.run(start, 'twitter sux')).to.eql([ - { t: t.DOMAIN, v: 'twitter', s: 0, e: 7 }, + { t: t.SLASH_SCHEME, v: 'twitter', s: 0, e: 7 }, { t: t.WS, v: ' ', s: 7, e: 8 }, - { t: t.DOMAIN, v: 'sux', s: 8, e: 11 } + { t: t.WORD, v: 'sux', s: 8, e: 11 } ]); }); }); diff --git a/test/spec/core/tokens/multi.test.js b/test/spec/core/tokens/multi.test.js index a4126bdd..44952cb3 100644 --- a/test/spec/core/tokens/multi.test.js +++ b/test/spec/core/tokens/multi.test.js @@ -16,48 +16,38 @@ describe('linkifyjs/core/tokens/multi', () => { describe('Url', () => { let input1 = 'Ftps://www.github.com/Hypercontext/linkify'; - let input2 = '//Amazon.ca/Sales'; - let input3 = 'co.co?o=%2D&p=@gc#wat'; - let url1, url2, url3; + let input2 = 'co.co/?o=%2D&p=@gc#wat'; + let url1, url2; before(() => { const urlTextTokens1 = scanner.run(scannerStart, input1); const urlTextTokens2 = scanner.run(scannerStart, input2); - const urlTextTokens3 = scanner.run(scannerStart, input3); url1 = new mtk.Url(input1, urlTextTokens1); url2 = new mtk.Url(input2, urlTextTokens2); - url3 = new mtk.Url(input3, urlTextTokens3); }); describe('#isLink', () => { it('Is true in all cases', () => { expect(url1.isLink).to.be.ok; 
expect(url2.isLink).to.be.ok; - expect(url3.isLink).to.be.ok; }); }); describe('#toString()', () => { it('Returns the exact URL text', () => { expect(url1.toString()).to.be.eql('Ftps://www.github.com/Hypercontext/linkify'); - expect(url2.toString()).to.be.eql('//Amazon.ca/Sales'); - expect(url3.toString()).to.be.eql('co.co?o=%2D&p=@gc#wat'); + expect(url2.toString()).to.be.eql('co.co/?o=%2D&p=@gc#wat'); }); }); describe('#toHref()', () => { - it('Keeps the protocol the same as the original URL (and lowercases it)', () => { + it('Keeps the protocol the same as the original URL', () => { expect(url1.toHref()).to.be.eql('Ftps://www.github.com/Hypercontext/linkify'); }); - it('Lowercases the domain name only and leaves off the protocol if the URL begins with "//"', () => { - expect(url2.toHref()).to.be.eql('//Amazon.ca/Sales'); - }); - it('Adds a default protocol, if required', () => { - expect(url3.toHref()).to.be.eql('http://co.co?o=%2D&p=@gc#wat'); - expect(url3.toHref('ftp')).to.be.eql('ftp://co.co?o=%2D&p=@gc#wat'); + expect(url2.toHref()).to.be.eql('http://co.co/?o=%2D&p=@gc#wat'); }); }); @@ -73,23 +63,14 @@ describe('linkifyjs/core/tokens/multi', () => { end: input1.length }); - expect(url2.toObject()).to.be.eql({ + expect(url2.toObject('https')).to.be.eql({ type: 'url', value: input2, - href: input2, + href: 'https://co.co/?o=%2D&p=@gc#wat', isLink: true, start: 0, end: input2.length }); - - expect(url3.toObject('https')).to.be.eql({ - type: 'url', - value: input3, - href: 'https://co.co?o=%2D&p=@gc#wat', - isLink: true, - start: 0, - end: input3.length - }); }); }); @@ -99,7 +80,6 @@ describe('linkifyjs/core/tokens/multi', () => { }); it('Tests false when there is no protocol', () => { expect(url2.hasProtocol()).to.not.be.ok; - expect(url3.hasProtocol()).to.not.be.ok; }); }); @@ -140,7 +120,7 @@ describe('linkifyjs/core/tokens/multi', () => { before(() => { const emailTextTokens = scanner.run(scannerStart, input); - email = new mtk.MailtoEmail(input, 
emailTextTokens); + email = new mtk.Url(input, emailTextTokens); }); describe('#isLink', () => { diff --git a/test/spec/linkifyjs.test.js b/test/spec/linkifyjs.test.js index 9f57f7f1..1d7054ca 100644 --- a/test/spec/linkifyjs.test.js +++ b/test/spec/linkifyjs.test.js @@ -1,7 +1,71 @@ /* eslint-disable mocha/no-setup-in-describe */ +const { expect } = require('chai'); const linkify = require('linkifyjs/src/linkify'); +const ticketPlugin = ({ scanner, parser, utils }) => { + const { POUND, numeric } = scanner.tokens; + const TicketToken = utils.createTokenClass('ticket', { isLink: true }); + const Hash = parser.start.tt(POUND); + Hash.tt(numeric, TicketToken); +}; + describe('linkifyjs', () => { + describe('registerPlugin', () => { + beforeEach(() => { + linkify.registerPlugin('ticket', ticketPlugin); + }); + + it('Detects tickets after applying', () => { + expect(linkify.test('#123', 'ticket')).to.be.ok; + }); + + it('Logs a warning if registering same plugin twice', () => { + linkify.registerPlugin('ticket', ticketPlugin); + expect(linkify.test('#123', 'ticket')).to.be.ok; + }); + + it('Logs a warning if already initialized', () => { + linkify.init(); + linkify.registerPlugin('ticket2', ticketPlugin); + }); + }); + + describe('registerCustomProtocol', () => { + beforeEach(() => { + linkify.registerCustomProtocol('instagram', true); + linkify.registerCustomProtocol('view-source'); + }); + + it('Detects basic protocol', () => { + expect(linkify.test('instagram:user/nfrasser', 'url')).to.be.ok; + }); + + it('Detects basic protocol with slash slash', () => { + expect(linkify.test('instagram://user/nfrasser', 'url')).to.be.ok; + }); + + it('Detects compound protocol', () => { + expect(linkify.test('view-source://http://github.com/', 'url')).to.be.ok; + }); + + it('Does not detect protocol with non-optional //', () => { + expect(linkify.test('view-source:http://github.com/', 'url')).to.not.be.ok; + }); + + it('Does not detect custom protocol if already initialized', () 
=> { + linkify.init(); + linkify.registerCustomProtocol('fb'); + expect(linkify.test('fb://feed')).to.not.be.ok; + }); + + it('Throws error when protocol has invalid format', () => { + expect(() => linkify.registerCustomProtocol('-')).to.throw(); + expect(() => linkify.registerCustomProtocol('-fb')).to.throw(); + expect(() => linkify.registerCustomProtocol('fb-')).to.throw(); + expect(() => linkify.registerCustomProtocol('git+https')).to.throw(); // this may work in the future + }); + }); + describe('tokenize', () => { it('is a function', () => { expect(linkify.tokenize).to.be.a('function'); @@ -15,9 +79,40 @@ describe('linkifyjs', () => { it('is a function', () => { expect(linkify.find).to.be.a('function'); }); + it('takes a single argument', () => { expect(linkify.find.length).to.be.eql(1); // type is optional }); + + it('Find nothing in an empty string', () => { + expect(linkify.find('')).to.deep.eql([]); + }); + + it('Find nothing in a string with no links', () => { + expect(linkify.find('Hello World!')).to.deep.eql([]); + }); + + it('Find the link', () => { + expect(linkify.find('hello.world!')).to.deep.eql([{ + type: 'url', + value: 'hello.world', + href: 'http://hello.world', + isLink: true, + start: 0, + end: 11 + }]); + }); + + it('Find the link of the specific type', () => { + expect(linkify.find('For help with github.com, please contact support@example.com', 'email')).to.deep.eql([{ + type: 'email', + value: 'support@example.com', + href: 'mailto:support@example.com', + isLink: true, + start: 41, + end: 60 + }]); + }); }); describe('test', () => { @@ -38,7 +133,7 @@ describe('linkifyjs', () => { ['test+4@uwaterloo.ca', true], ['test+4@uwaterloo.ca', false, 'url'], ['test+4@uwaterloo.ca', true, 'email'], - ['mailto:test+5@uwaterloo.ca', true, 'email'], + ['mailto:test+5@uwaterloo.ca', true, 'url'], ['t.co', true], ['t.co g.co', false], // can only be one ['test@g.co t.co', false] // can only be one diff --git a/test/spec/plugins/hashtag.test.js 
b/test/spec/plugins/hashtag.test.js index 396015a8..c8dc1120 100644 --- a/test/spec/plugins/hashtag.test.js +++ b/test/spec/plugins/hashtag.test.js @@ -13,9 +13,12 @@ describe('plugins/hashtag', () => { }); describe('after plugin is applied', () => { - it ('can parse hashtags after applying the plugin', () => { + beforeEach(() => { linkify.registerPlugin('hashtag', hashtag); - expect(linkify.find('There is a #hashtag #YOLO-2015 #__swag__ and #1234 and #%^&*( #_ #__ should not work')) + }); + + it ('can parse hashtags after applying the plugin', () => { + expect(linkify.find('There is a #hashtag #YOLO_2015 #__swag__ and #1234 and #%^&*( #_ #__ should not work')) .to.be.eql([{ type: 'hashtag', value: '#hashtag', @@ -25,8 +28,8 @@ describe('plugins/hashtag', () => { end: 19 }, { type: 'hashtag', - value: '#YOLO-2015', - href: '#YOLO-2015', + value: '#YOLO_2015', + href: '#YOLO_2015', isLink: true, start: 20, end: 30 @@ -38,12 +41,50 @@ describe('plugins/hashtag', () => { start: 31, end: 40 }]); + }); + it('Works with basic hashtags', () => { expect(linkify.test('#wat', 'hashtag')).to.be.ok; + }); + + it('Works with trailing underscores', () => { expect(linkify.test('#bug_', 'hashtag')).to.be.ok; + }); + + it('Works with underscores', () => { expect(linkify.test('#bug_test', 'hashtag')).to.be.ok; + }); + + it('Works with double underscores', () => { expect(linkify.test('#bug__test', 'hashtag')).to.be.ok; + }); + + it('Works with number prefix', () => { + expect(linkify.test('#123abc', 'hashtag')).to.be.ok; + }); + + it('Works with number/underscore prefix', () => { + expect(linkify.test('#123_abc', 'hashtag')).to.be.ok; + }); + + it('Works with Hangul characters', () => { + expect(linkify.test('#일상', 'hashtag')).to.be.ok; + }); + + it('Works with Cyrillic characters', () => { + expect(linkify.test('#АБВ_бв', 'hashtag')).to.be.ok; + }); + + it('Works with Arabic characters', () => { + expect(linkify.test('#سلام', 'hashtag')).to.be.ok; + }); + + it('Does not work with 
just numbers', () => { expect(linkify.test('#987', 'hashtag')).to.not.be.ok; }); + + it('Does not work with just numbers and underscore', () => { + expect(linkify.test('#987_654', 'hashtag')).to.not.be.ok; + }); }); }); diff --git a/test/spec/plugins/mention.test.js b/test/spec/plugins/mention.test.js index f75481ae..163e505b 100644 --- a/test/spec/plugins/mention.test.js +++ b/test/spec/plugins/mention.test.js @@ -1,3 +1,4 @@ +const { expect } = require('chai'); const linkify = require('linkifyjs'); const { mention } = require('linkifyjs/src/plugins/mention'); @@ -9,7 +10,7 @@ describe('plugins/mention', () => { .to.be.eql([]); expect(linkify.test('@wat', 'mention')).to.not.be.ok; - expect(linkify.test('@987', 'mention')).to.not.be.ok; + expect(linkify.test('@007', 'mention')).to.not.be.ok; }); describe('after plugin is applied', () => { @@ -76,36 +77,6 @@ describe('plugins/mention', () => { }]); }); - it('parses mentions with email syntax', () => { - expect(linkify.find('Hey @developers@soapbox')).to.deep.equal([{ - type: 'mention', - value: '@developers@soapbox', - href: '/developers@soapbox', - isLink: true, - start: 4, - end: 23 - }]); - - expect(linkify.find('Hey @developers@soapbox.example.com')).to.deep.equal([{ - type: 'mention', - value: '@developers@soapbox.example.com', - href: '/developers@soapbox.example.com', - isLink: true, - start: 4, - end: 35 - }]); - - expect(linkify.find('Hey @developers@soapbox you can mail me at someone@soapbox')).to.deep.equal([{ - type: 'mention', - value: '@developers@soapbox', - href: '/developers@soapbox', - isLink: true, - start: 4, - end: 23 - }]); - - }); - it('parses github team-style mentions with slashes', () => { expect(linkify.find('Hey @500px/web please review this')).to.deep.equal([{ type: 'mention', @@ -135,29 +106,29 @@ describe('plugins/mention', () => { }]); }); - it('parses mentions with dots', () => { + it('parses mentions with dots (ignores past the dots)', () => { expect(linkify.find('Hey @john.doe 
please review this')).to.deep.equal([{ type: 'mention', - value: '@john.doe', - href: '/john.doe', + value: '@john', + href: '/john', isLink: true, start: 4, - end: 13 + end: 9 }]); }); it('ignores extra dots at the end of mentions', () => { - expect(linkify.find('We should get ...@soapbox._developers.@soapbox.cs.... to be awesome')).to.deep.equal([{ + expect(linkify.find('We should get ...@soapbox-_developers.@soapbox_cs.... to be awesome')).to.deep.equal([{ type: 'mention', - value: '@soapbox._developers', - href: '/soapbox._developers', + value: '@soapbox-_developers', + href: '/soapbox-_developers', isLink: true, start: 17, end: 37 }, { type: 'mention', - value: '@soapbox.cs', - href: '/soapbox.cs', + value: '@soapbox_cs', + href: '/soapbox_cs', isLink: true, start: 38, end: 49 @@ -169,7 +140,7 @@ describe('plugins/mention', () => { }); it('ignores text only made up of symbols', () => { - expect(linkify.find('Is @- or @__ a person? What about @%_% no, probably not')).to.deep.equal([]); + expect(linkify.find('Is @- or @~! a person? What about @%_% no, probably not')).to.deep.equal([]); }); it('ignores punctuation at the end of mentions', () => { @@ -221,6 +192,26 @@ describe('plugins/mention', () => { end: 25 }]); }); + + it('detects trailing hyphen', () => { + expect(linkify.test('@123-', 'mention')).to.be.ok; + }); + + it('detects interjecting hyphen', () => { + expect(linkify.test('@123-abc', 'mention')).to.be.ok; + }); + + it('detects single underscore', () => { + expect(linkify.test('@_', 'mention')).to.be.ok; + }); + + it('detects multiple underscore', () => { + expect(linkify.test('@__', 'mention')).to.be.ok; + }); + + it('ignores interjecting dot', () => { + expect(linkify.test('@hello.world', 'mention')).to.not.be.ok; + }); }); afterEach(() => { linkify.reset(); });