new sequential parser

- begin matches are matched a single time (they no longer need to be rematched after found) - look-ahead should now work properly for begin matches because of this change - should be a tiny bit faster Before The old parser would build a list of regexes per mode and then combine that into a large regex. This is what was used to scan your code for matches. But after a match was found it had no way on known WHICH match - so it would then have to re-run all the rules sequentially on the bit of match text trying to figure out which rule had matched. The problem is while the original matcher was running agianst the full code this "rematch" was only running aginst the matched text. So look-ahead matches would naturally fail becasue the content they were tryign to look-ahead to was no longer present. After We take the list of regexes per mode and combine then into a larger regex, but with match groups. We keep track of which match group position correspond to which rule. Now when we hit a match we can check which match group was matched and find the associated rule/mode that was matched withotu having to double check. Look-ahead begin matches now "just work" because the rules are always running against the full body of text and not just a subset. Caveats This doesn't solve look-ahead for end matching so naturally it also does nothing for endSameAsBegin. IE, don't expect look-aheads to work properly in those situations yet.
highlightjs · Oct 10, 2019 · 2981f8e · 2981f8e
1 parent 7a94cff
commit 2981f8e
Showing 1 changed file with 165 additions and 77 deletions.
diff --git a/src/highlight.js b/src/highlight.js
@@ -41,7 +41,7 @@ https://highlightjs.org/
       languagePrefixRe = /\blang(?:uage)?-([\w-]+)\b/i,
       fixMarkupRe      = /((^(<[^>]+>|\t|)+|(?:\n)))/gm;
 
-  // The object will be assigned by the build tool. It used to synchronize API 
+  // The object will be assigned by the build tool. It used to synchronize API
   // of external language files with minified version of the highlight.js library.
   var API_REPLACES;
 
@@ -251,8 +251,15 @@ https://highlightjs.org/
       );
     }
 
+    function reCountMatchGroups(re) {
+      return (new RegExp(re.toString() + '|')).exec('').length - 1;
+    }
+
     // joinRe logically computes regexps.join(separator), but fixes the
     // backreferences so they continue to match.
+    // it also places each individual regular expression into it's own
+    // match group, keeping track of the sequencing of those match groups
+    // is currently an exercise for the caller. :-)
     function joinRe(regexps, separator) {
       // backreferenceRe matches an open parenthesis or backreference. To avoid
       // an incorrect parse, it additionally matches the following:
@@ -265,11 +272,13 @@ https://highlightjs.org/
       var numCaptures = 0;
       var ret = '';
       for (var i = 0; i < regexps.length; i++) {
+        numCaptures += 1
         var offset = numCaptures;
         var re = reStr(regexps[i]);
         if (i > 0) {
           ret += separator;
         }
+        ret += "(";
         while (re.length > 0) {
           var match = backreferenceRe.exec(re);
           if (match == null) {
@@ -288,10 +297,73 @@ https://highlightjs.org/
             }
           }
         }
+        ret += ")";
       }
       return ret;
     }
 
+    function buildModeRegex(mode) {
+
+      var matchIndexes = {}
+      var matcherRe;
+      var regexes = [];
+      var matcher = {};
+      var matchAt = 1;
+
+      function addRule(rule, regex) {
+        matchIndexes[matchAt] = rule
+        regexes.push([rule, regex])
+        matchAt += reCountMatchGroups(regex) + 1
+      }
+
+      for (let term of mode.contains) {
+        let re;
+        if (term.beginKeywords) {
+          re = '\\.?(?:' + term.begin + ')\\.?';
+        } else {
+          re = term.begin;
+        }
+        addRule(term, re)
+      }
+      if (mode.terminator_end)
+        addRule("end", mode.terminator_end)
+      if (mode.illegal)
+        addRule("illegal", mode.illegal)
+
+      var terminators = regexes.map(function(el) { return el[1] })
+      matcherRe = langRe(joinRe(terminators, '|'), true)
+
+      matcher.lastIndex = 0
+      matcher.exec = function(s) {
+        var rule;
+
+        if( regexes.length === 0) return null;
+
+        matcherRe.lastIndex = matcher.lastIndex
+        var match = matcherRe.exec(s)
+        if (!match) { return null; }
+
+        for(var i = 0; i<match.length; i++) {
+          if (match[i] != undefined && matchIndexes["" +i] != undefined ) {
+            rule = matchIndexes[""+i];
+            break
+          }
+        }
+
+        // illegal or end match
+        if (typeof rule === "string") {
+          match.type = rule;
+          match.extra = [mode.illegal, mode.terminator_end]
+        } else {
+          match.type = "begin";
+          match.rule = rule;
+        }
+        return match;
+      }
+
+      return matcher
+    }
+
     function compileMode(mode, parent) {
       if (mode.compiled)
         return;
@@ -355,16 +427,9 @@ https://highlightjs.org/
         compileMode(mode.starts, parent);
       }
 
-      var terminators =
-        mode.contains.map(function(c) {
-          return c.beginKeywords ? '\\.?(?:' + c.begin + ')\\.?' : c.begin;
-        })
-        .concat([mode.terminator_end, mode.illegal])
-        .map(reStr)
-        .filter(Boolean);
-      mode.terminators = terminators.length ? langRe(joinRe(terminators, '|'), true) : {exec: function(/*s*/) {return null;}};
+      mode.terminators = buildModeRegex(mode)
     }
-    
+
     compileMode(language);
   }
 
@@ -383,19 +448,6 @@ https://highlightjs.org/
       return new RegExp(value.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'), 'm');
     }
 
-    function subMode(lexeme, mode) {
-      var i, length;
-
-      for (i = 0, length = mode.contains.length; i < length; i++) {
-        if (testRe(mode.contains[i].beginRe, lexeme)) {
-          if (mode.contains[i].endSameAsBegin) {
-            mode.contains[i].endRe = escapeRe( mode.contains[i].beginRe.exec(lexeme)[0] );
-          }
-          return mode.contains[i];
-        }
-      }
-    }
-
     function endOfMode(mode, lexeme) {
       if (testRe(mode.endRe, lexeme)) {
         while (mode.endsParent && mode.parent) {
@@ -408,10 +460,6 @@ https://highlightjs.org/
       }
     }
 
-    function isIllegal(lexeme, mode) {
-      return !ignore_illegals && testRe(mode.illegalRe, lexeme);
-    }
-
     function keywordMatch(mode, match) {
       var match_str = language.case_insensitive ? match[0].toLowerCase() : match[0];
       return mode.keywords.hasOwnProperty(match_str) && mode.keywords[match_str];
@@ -487,74 +535,112 @@ https://highlightjs.org/
       top = Object.create(mode, {parent: {value: top}});
     }
 
-    function processLexeme(buffer, lexeme) {
 
-      mode_buffer += buffer;
+    function doBeginMatch(match) {
+      var lexeme = match[0]
+      var new_mode = match.rule
 
-      if (lexeme == null) {
-        processBuffer();
-        return 0;
+      if (new_mode && new_mode.endSameAsBegin) {
+        new_mode.endRe = escapeRe( lexeme );
       }
 
-      var new_mode = subMode(lexeme, top);
-      if (new_mode) {
-        if (new_mode.skip) {
+      if (new_mode.skip) {
+        mode_buffer += lexeme;
+      } else {
+        if (new_mode.excludeBegin) {
           mode_buffer += lexeme;
-        } else {
-          if (new_mode.excludeBegin) {
-            mode_buffer += lexeme;
-          }
-          processBuffer();
-          if (!new_mode.returnBegin && !new_mode.excludeBegin) {
-            mode_buffer = lexeme;
-          }
         }
-        startNewMode(new_mode, lexeme);
-        return new_mode.returnBegin ? 0 : lexeme.length;
+        processBuffer();
+        if (!new_mode.returnBegin && !new_mode.excludeBegin) {
+          mode_buffer = lexeme;
+        }
       }
+      startNewMode(new_mode, lexeme);
+      return new_mode.returnBegin ? 0 : lexeme.length;
+    }
 
+    function doEndMatch(match) {
+      var lexeme = match[0]
       var end_mode = endOfMode(top, lexeme);
-      if (end_mode) {
-        var origin = top;
-        if (origin.skip) {
+      if (!end_mode) { return }
+
+      var origin = top;
+      if (origin.skip) {
+        mode_buffer += lexeme;
+      } else {
+        if (!(origin.returnEnd || origin.excludeEnd)) {
           mode_buffer += lexeme;
-        } else {
-          if (!(origin.returnEnd || origin.excludeEnd)) {
-            mode_buffer += lexeme;
-          }
-          processBuffer();
-          if (origin.excludeEnd) {
-            mode_buffer = lexeme;
-          }
         }
-        do {
-          if (top.className) {
-            result += spanEndTag;
-          }
-          if (!top.skip && !top.subLanguage) {
-            relevance += top.relevance;
-          }
-          top = top.parent;
-        } while (top !== end_mode.parent);
-        if (end_mode.starts) {
-          if (end_mode.endSameAsBegin) {
-            end_mode.starts.endRe = end_mode.endRe;
-          }
-          startNewMode(end_mode.starts, '');
+        processBuffer();
+        if (origin.excludeEnd) {
+          mode_buffer = lexeme;
         }
-        return origin.returnEnd ? 0 : lexeme.length;
       }
+      do {
+        if (top.className) {
+          result += spanEndTag;
+        }
+        if (!top.skip && !top.subLanguage) {
+          relevance += top.relevance;
+        }
+        top = top.parent;
+      } while (top !== end_mode.parent);
+      if (end_mode.starts) {
+        if (end_mode.endSameAsBegin) {
+          end_mode.starts.endRe = end_mode.endRe;
+        }
+        startNewMode(end_mode.starts, '');
+      }
+      return origin.returnEnd ? 0 : lexeme.length;
+    }
 
-      if (isIllegal(lexeme, top))
+    var lastMatch = {}
+    function processLexeme(text_before_match, match) {
+
+      var lexeme = match && match[0]
+
+      // add non-matched text to the current mode buffer
+      mode_buffer += text_before_match;
+
+      if (lexeme == null) {
+        processBuffer();
+        return 0;
+      }
+
+      // we've found a 0 width match and we're stuck, so we need to advance
+      // this happens when we have badly behaved rules that have optional matchers to the degree that
+      // sometimes they can end up matching nothing at all
+      // Ref: https://github.com/highlightjs/highlight.js/issues/2140
+      if (lastMatch.type=="begin" && match.type=="end" && lastMatch.index == match.index && lexeme === "") {
+        return 1;
+      }
+      lastMatch = match;
+
+      if (match.type==="begin") {
+        return doBeginMatch(match)
+      } else if (match.type==="illegal" && !ignore_illegals) {
+        // illegal match, we do not continue processing
         throw new Error('Illegal lexeme "' + lexeme + '" for mode "' + (top.className || '<unnamed>') + '"');
+      } else if (match.type==="end") {
+        var processed = doEndMatch(match)
+        if (processed != undefined)
+          return processed
+      }
 
       /*
-      Parser should not reach this point as all types of lexemes should be caught
-      earlier, but if it does due to some bug make sure it advances at least one
-      character forward to prevent infinite looping.
+      Why might be find ourselves here?  Only one occasion now.  An end match that was
+      triggered but could not be completed.  When might this happen?  When an `endSameasBegin`
+      rule sets the end rule to a specific match.  Since the overall mode termination rule that's
+      being used to scan the text isn't recompiled that means that any match that LOOKS like
+      the end (but is not, because it is not an exact match to the beginning) will
+      end up here.  A definite end match, but when `doEndMatch` tries to "reapply"
+      the end rule and fails to match, we wind up here, and just silently ignore the end.
+
+      This causes no real harm other than stopping a few times too many.
       */
+
       mode_buffer += lexeme;
-      return lexeme.length || 1;
+      return lexeme.length;
     }
 
     var language = getLanguage(name);
@@ -580,7 +666,7 @@ https://highlightjs.org/
         match = top.terminators.exec(value);
         if (!match)
           break;
-        count = processLexeme(value.substring(index, match.index), match[0]);
+        count = processLexeme(value.substring(index, match.index), match);
         index = match.index + count;
       }
       processLexeme(value.substr(index));
@@ -592,12 +678,14 @@ https://highlightjs.org/
       return {
         relevance: relevance,
         value: result,
+        illegal:false,
         language: name,
         top: top
       };
     } catch (e) {
       if (e.message && e.message.indexOf('Illegal') !== -1) {
         return {
+          illegal: true,
           relevance: 0,
           value: escape(value)
         };