Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Total rework of Emphasis/Strong #1864

Merged
merged 15 commits into from
Feb 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 73 additions & 91 deletions lib/marked.esm.js
Original file line number Diff line number Diff line change
Expand Up @@ -856,46 +856,61 @@ var Tokenizer_1 = class Tokenizer {
}
}

strong(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.strong.start.exec(src);
emStrong(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.emStrong.lDelim.exec(src);
if (!match) return;

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
const endReg = match[0] === '**' ? this.rules.inline.strong.endAst : this.rules.inline.strong.endUnd;
if (match[3] && prevChar.match(/[\p{L}\p{N}]/u)) return; // _ can't be between two alphanumerics. \p{L}\p{N} includes non-english alphabet/numbers as well

const nextChar = match[1] || match[2] || '';

if (!nextChar || (nextChar && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {
const lLength = match[0].length - 1;
let rDelim, rLength, delimTotal = lLength, midDelimTotal = 0;

const endReg = match[0][0] === '*' ? this.rules.inline.emStrong.rDelimAst : this.rules.inline.emStrong.rDelimUnd;
endReg.lastIndex = 0;

let cap;
maskedSrc = maskedSrc.slice(-1 * src.length + lLength); // Bump maskedSrc to same section of string as src (move to lexer?)

while ((match = endReg.exec(maskedSrc)) != null) {
cap = this.rules.inline.strong.middle.exec(maskedSrc.slice(0, match.index + 3));
if (cap) {
return {
type: 'strong',
raw: src.slice(0, cap[0].length),
text: src.slice(2, cap[0].length - 2)
};
rDelim = match[1] || match[2] || match[3] || match[4] || match[5] || match[6];

if (!rDelim) continue; // matched the first alternative in rules.js (skip the * in __abc*abc__)

rLength = rDelim.length;

if (match[3] || match[4]) { // found another Left Delim
delimTotal += rLength;
continue;
} else if (match[5] || match[6]) { // either Left or Right Delim
if (lLength % 3 && !((lLength + rLength) % 3)) {
midDelimTotal += rLength;
continue; // CommonMark Emphasis Rules 9-10
}
}
}
}
}

em(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.em.start.exec(src);
delimTotal -= rLength;

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
const endReg = match[0] === '*' ? this.rules.inline.em.endAst : this.rules.inline.em.endUnd;
if (delimTotal > 0) continue; // Haven't found enough closing delimiters

endReg.lastIndex = 0;
// If this is the last rDelimiter, remove extra characters. *a*** -> *a*
if (delimTotal + midDelimTotal - rLength <= 0 && !maskedSrc.slice(endReg.lastIndex).match(endReg)) {
rLength = Math.min(rLength, rLength + delimTotal + midDelimTotal);
}

let cap;
while ((match = endReg.exec(maskedSrc)) != null) {
cap = this.rules.inline.em.middle.exec(maskedSrc.slice(0, match.index + 2));
if (cap) {
if (Math.min(lLength, rLength) % 2) {
return {
type: 'em',
raw: src.slice(0, cap[0].length),
text: src.slice(1, cap[0].length - 1)
raw: src.slice(0, lLength + match.index + rLength + 1),
text: src.slice(1, lLength + match.index + rLength)
};
}
if (Math.min(lLength, rLength) % 2 === 0) {
return {
type: 'strong',
raw: src.slice(0, lLength + match.index + rLength + 1),
text: src.slice(2, lLength + match.index + rLength - 1)
};
}
}
Expand Down Expand Up @@ -1199,74 +1214,41 @@ const inline = {
reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/,
nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/,
reflinkSearch: 'reflink|nolink(?!\\()',
strong: {
start: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])|__/, // (1) returns if starts w/ punctuation
middle: /^\*\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*\*$|^__(?![\s])((?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?)__$/,
endAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation_\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]__(?!_)(?:(?=[punctuation*\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
},
em: {
start: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation
middle: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/,
endAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation_\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]_(?!_)(?:(?=[punctuation*\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
emStrong: {
lDelim: /^(?:\*+(?:([punct_])|[^\s*]))|^_+(?:([punct*])|([^\s_]))/,
// (1) and (2) can only be a Right Delimiter. (3) and (4) can only be Left. (5) and (6) can be either Left or Right.
// () Skip other delimiter (1) #*** (2) a***#, a*** (3) #***a, ***a (4) ***# (5) #***# (6) a***a
rDelimAst: /\_\_[^_]*?\*[^_]*?\_\_|[punct_](\*+)(?=[\s]|$)|[^punct*_\s](\*+)(?=[punct_\s]|$)|[punct_\s](\*+)(?=[^punct*_\s])|[\s](\*+)(?=[punct_])|[punct_](\*+)(?=[punct_])|[^punct*_\s](\*+)(?=[^punct*_\s])/,
rDelimUnd: /\*\*[^*]*?\_[^*]*?\*\*|[punct*](\_+)(?=[\s]|$)|[^punct*_\s](\_+)(?=[punct*\s]|$)|[punct*\s](\_+)(?=[^punct*_\s])|[\s](\_+)(?=[punct*])|[punct*](\_+)(?=[punct*])/ // ^- Not allowed for _
},
code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/,
br: /^( {2,}|\\)\n(?!\s*$)/,
del: noopTest$1,
text: /^(`+|[^`])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n)))/,
punctuation: /^([\s*punctuation])/
text: /^(`+|[^`])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*_]|\b_|$)|[^ ](?= {2,}\n)))/,
punctuation: /^([\spunctuation])/
};

// list of punctuation marks from common mark spec
// without * and _ to workaround cases with double emphasis
// list of punctuation marks from CommonMark spec
// without * and _ to handle the different emphasis markers * and _
inline._punctuation = '!"#$%&\'()+\\-.,/:;<=>?@\\[\\]`^{|}~';
inline.punctuation = edit$1(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex();

// sequences em should skip over [title](link), `code`, <html>
inline._blockSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>';
inline._overlapSkip = '__[^_]*?__|\\*\\*\\[^\\*\\]*?\\*\\*';
inline.blockSkip = /\[[^\]]*?\]\([^\)]*?\)|`[^`]*?`|<[^>]*?>/g;
inline.escapedEmSt = /\\\*|\\_/g;

inline._comment = edit$1(block._comment).replace('(?:-->|$)', '-->').getRegex();

inline.em.start = edit$1(inline.em.start)
.replace(/punctuation/g, inline._punctuation)
inline.emStrong.lDelim = edit$1(inline.emStrong.lDelim)
.replace(/punct/g, inline._punctuation)
.getRegex();

inline.em.middle = edit$1(inline.em.middle)
.replace(/punctuation/g, inline._punctuation)
.replace(/overlapSkip/g, inline._overlapSkip)
inline.emStrong.rDelimAst = edit$1(inline.emStrong.rDelimAst, 'g')
.replace(/punct/g, inline._punctuation)
.getRegex();

inline.em.endAst = edit$1(inline.em.endAst, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.em.endUnd = edit$1(inline.em.endUnd, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.strong.start = edit$1(inline.strong.start)
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.strong.middle = edit$1(inline.strong.middle)
.replace(/punctuation/g, inline._punctuation)
.replace(/overlapSkip/g, inline._overlapSkip)
.getRegex();

inline.strong.endAst = edit$1(inline.strong.endAst, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.strong.endUnd = edit$1(inline.strong.endUnd, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.blockSkip = edit$1(inline._blockSkip, 'g')
.getRegex();

inline.overlapSkip = edit$1(inline._overlapSkip, 'g')
inline.emStrong.rDelimUnd = edit$1(inline.emStrong.rDelimUnd, 'g')
.replace(/punct/g, inline._punctuation)
.getRegex();

inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
Expand Down Expand Up @@ -1345,7 +1327,7 @@ inline.gfm = merge$1({}, inline.normal, {
url: /^((?:ftp|https?):\/\/|www\.)(?:[a-zA-Z0-9\-]+\.?)+[^\s<]*|^email/,
_backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
del: /^(~~?)(?=[^\s~])([\s\S]*?[^\s~])\1(?=[^~]|$)/,
text: /^([`~]+|[^`~])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*~]|\b_|https?:\/\/|ftp:\/\/|www\.|$)|[^ ](?= {2,}\n)|[^a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-](?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))|(?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))/
text: /^([`~]+|[^`~])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*~_]|\b_|https?:\/\/|ftp:\/\/|www\.|$)|[^ ](?= {2,}\n)|[^a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-](?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))|(?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))/
});

inline.gfm.url = edit$1(inline.gfm.url, 'i')
Expand Down Expand Up @@ -1720,11 +1702,17 @@ var Lexer_1 = class Lexer {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString$1('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
}

// Mask out escaped em & strong delimiters
while ((match = this.tokenizer.rules.inline.escapedEmSt.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.escapedEmSt.lastIndex);
}

while (src) {
if (!keepPrevChar) {
prevChar = '';
}
keepPrevChar = false;

// escape
if (token = this.tokenizer.escape(src)) {
src = src.substring(token.raw.length);
Expand Down Expand Up @@ -1761,16 +1749,8 @@ var Lexer_1 = class Lexer {
continue;
}

// strong
if (token = this.tokenizer.strong(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
continue;
}

// em
if (token = this.tokenizer.em(src, maskedSrc, prevChar)) {
// em & strong
if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
Expand Down Expand Up @@ -1816,7 +1796,9 @@ var Lexer_1 = class Lexer {
// text
if (token = this.tokenizer.inlineText(src, inRawBlock, smartypants)) {
src = src.substring(token.raw.length);
prevChar = token.raw.slice(-1);
if (token.raw.slice(-1) !== '_') { // Track prevChar before string of ____ started
prevChar = token.raw.slice(-1);
}
keepPrevChar = true;
tokens.push(token);
continue;
Expand Down
Loading