From 2981f8e82609d5797d32acf01e2dcd4dc5a373d6 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 10 Oct 2019 11:16:45 -0400 Subject: [PATCH] new sequential parser - begin matches are matched a single time (they no longer need to be rematched after found) - look-ahead should now work properly for begin matches because of this change - should be a tiny bit faster Before The old parser would build a list of regexes per mode and then combine that into a large regex. This is what was used to scan your code for matches. But after a match was found it had no way of knowing WHICH match - so it would then have to re-run all the rules sequentially on the bit of match text trying to figure out which rule had matched. The problem is that while the original matcher was running against the full code this "rematch" was only running against the matched text. So look-ahead matches would naturally fail because the content they were trying to look-ahead to was no longer present. After We take the list of regexes per mode and combine them into a larger regex, but with match groups. We keep track of which match group position corresponds to which rule. Now when we hit a match we can check which match group was matched and find the associated rule/mode that was matched without having to double check. Look-ahead begin matches now "just work" because the rules are always running against the full body of text and not just a subset. Caveats This doesn't solve look-ahead for end matching so naturally it also does nothing for endSameAsBegin. IE, don't expect look-aheads to work properly in those situations yet. 
--- src/highlight.js | 242 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 165 insertions(+), 77 deletions(-) diff --git a/src/highlight.js b/src/highlight.js index ec33008187..f00ac98713 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -41,7 +41,7 @@ https://highlightjs.org/ languagePrefixRe = /\blang(?:uage)?-([\w-]+)\b/i, fixMarkupRe = /((^(<[^>]+>|\t|)+|(?:\n)))/gm; - // The object will be assigned by the build tool. It used to synchronize API + // The object will be assigned by the build tool. It used to synchronize API // of external language files with minified version of the highlight.js library. var API_REPLACES; @@ -251,8 +251,15 @@ https://highlightjs.org/ ); } + function reCountMatchGroups(re) { + return (new RegExp(re.toString() + '|')).exec('').length - 1; + } + // joinRe logically computes regexps.join(separator), but fixes the // backreferences so they continue to match. + // it also places each individual regular expression into its own + // match group; keeping track of the sequencing of those match groups + // is currently an exercise for the caller. :-) function joinRe(regexps, separator) { // backreferenceRe matches an open parenthesis or backreference. 
To avoid // an incorrect parse, it additionally matches the following: @@ -265,11 +272,13 @@ https://highlightjs.org/ var numCaptures = 0; var ret = ''; for (var i = 0; i < regexps.length; i++) { + numCaptures += 1 var offset = numCaptures; var re = reStr(regexps[i]); if (i > 0) { ret += separator; } + ret += "("; while (re.length > 0) { var match = backreferenceRe.exec(re); if (match == null) { @@ -288,10 +297,73 @@ https://highlightjs.org/ } } } + ret += ")"; } return ret; } + function buildModeRegex(mode) { + + var matchIndexes = {} + var matcherRe; + var regexes = []; + var matcher = {}; + var matchAt = 1; + + function addRule(rule, regex) { + matchIndexes[matchAt] = rule + regexes.push([rule, regex]) + matchAt += reCountMatchGroups(regex) + 1 + } + + for (let term of mode.contains) { + let re; + if (term.beginKeywords) { + re = '\\.?(?:' + term.begin + ')\\.?'; + } else { + re = term.begin; + } + addRule(term, re) + } + if (mode.terminator_end) + addRule("end", mode.terminator_end) + if (mode.illegal) + addRule("illegal", mode.illegal) + + var terminators = regexes.map(function(el) { return el[1] }) + matcherRe = langRe(joinRe(terminators, '|'), true) + + matcher.lastIndex = 0 + matcher.exec = function(s) { + var rule; + + if( regexes.length === 0) return null; + + matcherRe.lastIndex = matcher.lastIndex + var match = matcherRe.exec(s) + if (!match) { return null; } + + for(var i = 0; i') + '"'); + } else if (match.type==="end") { + var processed = doEndMatch(match) + if (processed != undefined) + return processed + } /* - Parser should not reach this point as all types of lexemes should be caught - earlier, but if it does due to some bug make sure it advances at least one - character forward to prevent infinite looping. + Why might we find ourselves here? Only one occasion now. An end match that was + triggered but could not be completed. When might this happen? When an `endSameAsBegin` + rule sets the end rule to a specific match. 
Since the overall mode termination rule that's + being used to scan the text isn't recompiled that means that any match that LOOKS like + the end (but is not, because it is not an exact match to the beginning) will + end up here. A definite end match, but when `doEndMatch` tries to "reapply" + the end rule and fails to match, we wind up here, and just silently ignore the end. + + This causes no real harm other than stopping a few times too many. */ + mode_buffer += lexeme; - return lexeme.length || 1; + return lexeme.length; } var language = getLanguage(name); @@ -580,7 +666,7 @@ https://highlightjs.org/ match = top.terminators.exec(value); if (!match) break; - count = processLexeme(value.substring(index, match.index), match[0]); + count = processLexeme(value.substring(index, match.index), match); index = match.index + count; } processLexeme(value.substr(index)); @@ -592,12 +678,14 @@ https://highlightjs.org/ return { relevance: relevance, value: result, + illegal:false, language: name, top: top }; } catch (e) { if (e.message && e.message.indexOf('Illegal') !== -1) { return { + illegal: true, relevance: 0, value: escape(value) };