Skip to content

Commit

Permalink
new sequential parser
Browse files Browse the repository at this point in the history
- begin matches are matched a single time
  (they no longer need to be rematched after found)
- look-ahead should now work properly for begin matches
  because of this change
- should be a tiny bit faster

Before

The old parser would build a list of regexes per mode and then
combine that into a large regex.  This is what was used to scan
your code for matches.  But after a match was found it had no way
of knowing WHICH rule had matched - so it would then have to re-run
all the rules sequentially on the bit of matched text trying to
figure out which rule had matched.

The problem is that while the original matcher was running against the
full code, this "rematch" was only running against the matched text.
So look-ahead matches would naturally fail because the content they
were trying to look ahead to was no longer present.

After

We take the list of regexes per mode and combine them into a larger
regex, but with match groups.  We keep track of which match group
positions correspond to which rule.  Now when we hit a match we can
check which match group was matched and find the associated rule/mode
that was matched without having to double check.

Look-ahead begin matches now "just work" because the rules are always
running against the full body of text and not just a subset.

Caveats

This doesn't solve look-ahead for end matching so naturally it also
does nothing for endSameAsBegin. IE, don't expect look-aheads to work
properly in those situations yet.
  • Loading branch information
joshgoebel committed Oct 10, 2019
1 parent 7a94cff commit 2981f8e
Showing 1 changed file with 165 additions and 77 deletions.
242 changes: 165 additions & 77 deletions src/highlight.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ https://highlightjs.org/
languagePrefixRe = /\blang(?:uage)?-([\w-]+)\b/i,
fixMarkupRe = /((^(<[^>]+>|\t|)+|(?:\n)))/gm;

// The object will be assigned by the build tool. It used to synchronize API
// The object will be assigned by the build tool. It used to synchronize API
// of external language files with minified version of the highlight.js library.
var API_REPLACES;

Expand Down Expand Up @@ -251,8 +251,15 @@ https://highlightjs.org/
);
}

// Counts the capture groups declared by `re` (a RegExp or a pattern string).
//
// An empty alternative is appended to the stringified pattern so that the
// probe regex is guaranteed to match the empty string; the resulting match
// array then holds one slot for the whole match plus one per capture group.
function reCountMatchGroups(re) {
  var probe = new RegExp(re.toString() + '|');
  var emptyMatch = probe.exec('');
  var groupCount = emptyMatch.length - 1;
  return groupCount;
}

// joinRe logically computes regexps.join(separator), but fixes the
// backreferences so they continue to match.
// It also places each individual regular expression into its own
// match group; keeping track of the sequencing of those match groups
// is currently an exercise for the caller. :-)
function joinRe(regexps, separator) {
// backreferenceRe matches an open parenthesis or backreference. To avoid
// an incorrect parse, it additionally matches the following:
Expand All @@ -265,11 +272,13 @@ https://highlightjs.org/
var numCaptures = 0;
var ret = '';
for (var i = 0; i < regexps.length; i++) {
numCaptures += 1
var offset = numCaptures;
var re = reStr(regexps[i]);
if (i > 0) {
ret += separator;
}
ret += "(";
while (re.length > 0) {
var match = backreferenceRe.exec(re);
if (match == null) {
Expand All @@ -288,10 +297,73 @@ https://highlightjs.org/
}
}
}
ret += ")";
}
return ret;
}

// Builds the combined "terminators" matcher for a mode.
//
// Every rule that can fire while inside `mode` -- the `begin` of each entry in
// `mode.contains`, then `mode.terminator_end`, then `mode.illegal` (the last
// two only when set) -- is joined into one alternation via `joinRe`, which
// wraps each rule in its own capture group. While adding rules we record the
// index of each rule's outer group, so that after a match we can tell which
// rule fired by looking at which recorded group participated.
//
// Returns an object with:
//   lastIndex - position to start scanning from; the caller sets it before
//               calling exec()
//   exec(s)   - returns null when there are no rules or nothing matches,
//               otherwise the RegExp match array annotated with either
//                 type: "begin" and rule: <the matching entry of contains>
//               or
//                 type: "end" | "illegal" and
//                 extra: [mode.illegal, mode.terminator_end]
//
// Depends on `langRe`, `joinRe` and `reCountMatchGroups` from the enclosing
// file. Written in ES5 (no `let` / `for...of`) to match the rest of the file.
function buildModeRegex(mode) {
  var matchIndexes = {}; // outer capture-group index -> rule (mode object, "end" or "illegal")
  var regexes = [];      // [rule, regex] pairs in alternation order
  var matcher = {};
  var matchAt = 1;       // capture-group index the next rule's outer group will get
  var matcherRe;
  var termIndex;
  var term;
  var re;

  function addRule(rule, regex) {
    matchIndexes[matchAt] = rule;
    regexes.push([rule, regex]);
    // Skip past this rule's own nested groups plus the outer group that
    // joinRe wraps around it.
    matchAt += reCountMatchGroups(regex) + 1;
  }

  for (termIndex = 0; termIndex < mode.contains.length; termIndex++) {
    term = mode.contains[termIndex];
    if (term.beginKeywords) {
      re = '\\.?(?:' + term.begin + ')\\.?';
    } else {
      re = term.begin;
    }
    addRule(term, re);
  }
  if (mode.terminator_end) {
    addRule("end", mode.terminator_end);
  }
  if (mode.illegal) {
    addRule("illegal", mode.illegal);
  }

  var terminators = regexes.map(function(el) { return el[1]; });
  // NOTE(review): the `true` flag presumably asks langRe for a global regex so
  // that the lastIndex assigned in exec() is honored -- confirm against langRe.
  matcherRe = langRe(joinRe(terminators, '|'), true);

  matcher.lastIndex = 0;
  matcher.exec = function(s) {
    var rule;
    var groupIndex;

    if (regexes.length === 0) { return null; }

    matcherRe.lastIndex = matcher.lastIndex;
    var match = matcherRe.exec(s);
    if (!match) { return null; }

    // Group 0 is the whole match and never maps to a rule, so start at 1.
    // The first participating group that was recorded by addRule identifies
    // the rule that fired.
    for (groupIndex = 1; groupIndex < match.length; groupIndex++) {
      if (match[groupIndex] !== undefined && matchIndexes[groupIndex] !== undefined) {
        rule = matchIndexes[groupIndex];
        break;
      }
    }

    // illegal or end match
    if (typeof rule === "string") {
      match.type = rule;
      match.extra = [mode.illegal, mode.terminator_end];
    } else {
      match.type = "begin";
      match.rule = rule;
    }
    return match;
  };

  return matcher;
}

function compileMode(mode, parent) {
if (mode.compiled)
return;
Expand Down Expand Up @@ -355,16 +427,9 @@ https://highlightjs.org/
compileMode(mode.starts, parent);
}

var terminators =
mode.contains.map(function(c) {
return c.beginKeywords ? '\\.?(?:' + c.begin + ')\\.?' : c.begin;
})
.concat([mode.terminator_end, mode.illegal])
.map(reStr)
.filter(Boolean);
mode.terminators = terminators.length ? langRe(joinRe(terminators, '|'), true) : {exec: function(/*s*/) {return null;}};
mode.terminators = buildModeRegex(mode)
}

compileMode(language);
}

Expand All @@ -383,19 +448,6 @@ https://highlightjs.org/
return new RegExp(value.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'), 'm');
}

// Finds the first entry of `mode.contains` whose `beginRe` matches at the
// start of `lexeme` (as decided by `testRe`) and returns it; returns
// undefined when no entry matches.
//
// Side effect: when the matching entry has `endSameAsBegin` set, its `endRe`
// is overwritten with an escaped, literal regex built from the text that
// `beginRe` matched.
function subMode(lexeme, mode) {
  var total = mode.contains.length;

  for (var idx = 0; idx < total; idx++) {
    var candidate = mode.contains[idx];

    if (!testRe(candidate.beginRe, lexeme)) {
      continue;
    }

    if (candidate.endSameAsBegin) {
      var beginText = candidate.beginRe.exec(lexeme)[0];
      candidate.endRe = escapeRe(beginText);
    }
    return candidate;
  }
}

function endOfMode(mode, lexeme) {
if (testRe(mode.endRe, lexeme)) {
while (mode.endsParent && mode.parent) {
Expand All @@ -408,10 +460,6 @@ https://highlightjs.org/
}
}

// Reports whether `lexeme` is illegal for `mode`: always false while
// `ignore_illegals` is in effect, otherwise whatever `testRe` returns for
// the mode's `illegalRe`.
function isIllegal(lexeme, mode) {
  if (ignore_illegals) {
    return false;
  }
  return testRe(mode.illegalRe, lexeme);
}

function keywordMatch(mode, match) {
var match_str = language.case_insensitive ? match[0].toLowerCase() : match[0];
return mode.keywords.hasOwnProperty(match_str) && mode.keywords[match_str];
Expand Down Expand Up @@ -487,74 +535,112 @@ https://highlightjs.org/
top = Object.create(mode, {parent: {value: top}});
}

function processLexeme(buffer, lexeme) {

mode_buffer += buffer;
function doBeginMatch(match) {
var lexeme = match[0]
var new_mode = match.rule

if (lexeme == null) {
processBuffer();
return 0;
if (new_mode && new_mode.endSameAsBegin) {
new_mode.endRe = escapeRe( lexeme );
}

var new_mode = subMode(lexeme, top);
if (new_mode) {
if (new_mode.skip) {
if (new_mode.skip) {
mode_buffer += lexeme;
} else {
if (new_mode.excludeBegin) {
mode_buffer += lexeme;
} else {
if (new_mode.excludeBegin) {
mode_buffer += lexeme;
}
processBuffer();
if (!new_mode.returnBegin && !new_mode.excludeBegin) {
mode_buffer = lexeme;
}
}
startNewMode(new_mode, lexeme);
return new_mode.returnBegin ? 0 : lexeme.length;
processBuffer();
if (!new_mode.returnBegin && !new_mode.excludeBegin) {
mode_buffer = lexeme;
}
}
startNewMode(new_mode, lexeme);
return new_mode.returnBegin ? 0 : lexeme.length;
}

function doEndMatch(match) {
var lexeme = match[0]
var end_mode = endOfMode(top, lexeme);
if (end_mode) {
var origin = top;
if (origin.skip) {
if (!end_mode) { return }

var origin = top;
if (origin.skip) {
mode_buffer += lexeme;
} else {
if (!(origin.returnEnd || origin.excludeEnd)) {
mode_buffer += lexeme;
} else {
if (!(origin.returnEnd || origin.excludeEnd)) {
mode_buffer += lexeme;
}
processBuffer();
if (origin.excludeEnd) {
mode_buffer = lexeme;
}
}
do {
if (top.className) {
result += spanEndTag;
}
if (!top.skip && !top.subLanguage) {
relevance += top.relevance;
}
top = top.parent;
} while (top !== end_mode.parent);
if (end_mode.starts) {
if (end_mode.endSameAsBegin) {
end_mode.starts.endRe = end_mode.endRe;
}
startNewMode(end_mode.starts, '');
processBuffer();
if (origin.excludeEnd) {
mode_buffer = lexeme;
}
return origin.returnEnd ? 0 : lexeme.length;
}
do {
if (top.className) {
result += spanEndTag;
}
if (!top.skip && !top.subLanguage) {
relevance += top.relevance;
}
top = top.parent;
} while (top !== end_mode.parent);
if (end_mode.starts) {
if (end_mode.endSameAsBegin) {
end_mode.starts.endRe = end_mode.endRe;
}
startNewMode(end_mode.starts, '');
}
return origin.returnEnd ? 0 : lexeme.length;
}

if (isIllegal(lexeme, top))
var lastMatch = {}
function processLexeme(text_before_match, match) {

var lexeme = match && match[0]

// add non-matched text to the current mode buffer
mode_buffer += text_before_match;

if (lexeme == null) {
processBuffer();
return 0;
}

// we've found a 0 width match and we're stuck, so we need to advance
// this happens when we have badly behaved rules that have optional matchers to the degree that
// sometimes they can end up matching nothing at all
// Ref: https://github.com/highlightjs/highlight.js/issues/2140
if (lastMatch.type=="begin" && match.type=="end" && lastMatch.index == match.index && lexeme === "") {
return 1;
}
lastMatch = match;

if (match.type==="begin") {
return doBeginMatch(match)
} else if (match.type==="illegal" && !ignore_illegals) {
// illegal match, we do not continue processing
throw new Error('Illegal lexeme "' + lexeme + '" for mode "' + (top.className || '<unnamed>') + '"');
} else if (match.type==="end") {
var processed = doEndMatch(match)
if (processed != undefined)
return processed
}

/*
Parser should not reach this point as all types of lexemes should be caught
earlier, but if it does due to some bug make sure it advances at least one
character forward to prevent infinite looping.
Why might we find ourselves here? Only one occasion now. An end match that was
triggered but could not be completed. When might this happen? When an `endSameAsBegin`
rule sets the end rule to a specific match. Since the overall mode termination rule that's
being used to scan the text isn't recompiled that means that any match that LOOKS like
the end (but is not, because it is not an exact match to the beginning) will
end up here. A definite end match, but when `doEndMatch` tries to "reapply"
the end rule and fails to match, we wind up here, and just silently ignore the end.
This causes no real harm other than stopping a few times too many.
*/

mode_buffer += lexeme;
return lexeme.length || 1;
return lexeme.length;
}

var language = getLanguage(name);
Expand All @@ -580,7 +666,7 @@ https://highlightjs.org/
match = top.terminators.exec(value);
if (!match)
break;
count = processLexeme(value.substring(index, match.index), match[0]);
count = processLexeme(value.substring(index, match.index), match);
index = match.index + count;
}
processLexeme(value.substr(index));
Expand All @@ -592,12 +678,14 @@ https://highlightjs.org/
return {
relevance: relevance,
value: result,
illegal:false,
language: name,
top: top
};
} catch (e) {
if (e.message && e.message.indexOf('Illegal') !== -1) {
return {
illegal: true,
relevance: 0,
value: escape(value)
};
Expand Down

0 comments on commit 2981f8e

Please sign in to comment.