/**
 * Logically computes `regexps.join(separator)`, but renumbers numeric
 * backreferences so each one keeps pointing at the capture group of its own
 * source pattern once all patterns share a single combined regex.
 *
 * @param {Array} regexps - patterns to join; each entry is passed through
 *   `reStr` (defined elsewhere in this file — presumably it yields the
 *   pattern's source string; confirm against its definition).
 * @param {string} separator - text inserted between consecutive patterns,
 *   e.g. '|'.
 * @returns {string} the joined pattern source with adjusted backreferences.
 */
function joinRe(regexps, separator) {
  // backreferenceRe matches an open parenthesis or backreference. To avoid
  // an incorrect parse, it additionally matches the following:
  // - [...] elements, where the meaning of parentheses and escapes change
  // - other escape sequences, so we do not misparse escape sequences as
  //   interesting elements (this also leaves \k<name> untouched)
  // - parentheses followed by '?', which are classified further below
  var backreferenceRe = /\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./;
  // Applied to the text right after "(?": a '<' that is NOT followed by '='
  // or '!' opens a named group, which captures and takes the next group
  // number. "(?<=" and "(?<!" are lookbehinds and do not capture.
  var namedGroupRe = /^<[^=!]/;
  var numCaptures = 0;
  var ret = '';
  for (var i = 0; i < regexps.length; i++) {
    // Every group opened by earlier patterns shifts this pattern's numbering.
    var offset = numCaptures;
    var re = reStr(regexps[i]);
    if (i > 0) {
      ret += separator;
    }
    while (re.length > 0) {
      var match = backreferenceRe.exec(re);
      if (match === null) {
        // Nothing interesting left; copy the tail verbatim.
        ret += re;
        break;
      }
      ret += re.substring(0, match.index);
      re = re.substring(match.index + match[0].length);
      if (match[0][0] === '\\' && match[1]) {
        // Adjust the backreference.
        ret += '\\' + String(Number(match[1]) + offset);
      } else {
        ret += match[0];
        if (match[0] === '(' || (match[0] === '(?' && namedGroupRe.test(re))) {
          numCaptures++;
        }
      }
    }
  }
  return ret;
}
langRe(joinRe(terminators, '|'), true) : {exec: function(/*s*/) {return null;}}; } compileMode(language); diff --git a/src/languages/cpp.js b/src/languages/cpp.js index f91e3f4a61..7817d6da09 100644 --- a/src/languages/cpp.js +++ b/src/languages/cpp.js @@ -19,13 +19,7 @@ function(hljs) { illegal: '\\n', contains: [hljs.BACKSLASH_ESCAPE] }, - { - // TODO: This does not handle raw string literals with prefixes. Using - // a single regex with backreferences would work (note to use *? - // instead of * to make it non-greedy), but the mode.terminators - // computation in highlight.js breaks the counting. - begin: '(u8?|U|L)?R"\\(', end: '\\)"', - }, + { begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\((?:.|\n)*?\)\1"/ }, { begin: '\'\\\\?.', end: '\'', illegal: '.' diff --git a/test/markup/cpp/string-literals.expect.txt b/test/markup/cpp/string-literals.expect.txt index 1210b01e5e..4c5baca14f 100644 --- a/test/markup/cpp/string-literals.expect.txt +++ b/test/markup/cpp/string-literals.expect.txt @@ -10,21 +10,47 @@ // Raw string literals (multiline) auto char_multi = R"(Hello "normal" -muliline +multiline string.)"; auto utf8_multi = u8R"(Hello "utf-8" -muliline +multiline string)"; auto utf16_multi = uR"(Hello "utf-16" -muliline +multiline string)"; auto utf32_multi = UR"(Hello "utf-32" -muliline +multiline string)"; +// Raw string literals with delimiter (multiline) +auto char_multi = R"blah1(Hello +"normal" +multiline +)" +)blah" +string.)blah1"; +auto utf8_multi = u8R"blah2(Hello +"utf-8" +multiline +)" +)blah" +string)blah2"; +auto utf16_multi = uR"blah3(Hello +"utf-16" +multiline +)" +)blah" +string)blah3"; +auto utf32_multi = UR"blah4(Hello +"utf-32" +multiline +)" +)blah" +string)blah4"; + // Meta strings #include <stdio> #include "lib.h" diff --git a/test/markup/cpp/string-literals.txt b/test/markup/cpp/string-literals.txt index 68b8bd411a..9939edba3e 100644 --- a/test/markup/cpp/string-literals.txt +++ b/test/markup/cpp/string-literals.txt @@ -10,21 +10,47 @@ auto 
wide_char = L"Hello wchar_t string"; // Raw string literals (multiline) auto char_multi = R"(Hello "normal" -muliline +multiline string.)"; auto utf8_multi = u8R"(Hello "utf-8" -muliline +multiline string)"; auto utf16_multi = uR"(Hello "utf-16" -muliline +multiline string)"; auto utf32_multi = UR"(Hello "utf-32" -muliline +multiline string)"; +// Raw string literals with delimiter (multiline) +auto char_multi = R"blah1(Hello +"normal" +multiline +)" +)blah" +string.)blah1"; +auto utf8_multi = u8R"blah2(Hello +"utf-8" +multiline +)" +)blah" +string)blah2"; +auto utf16_multi = uR"blah3(Hello +"utf-16" +multiline +)" +)blah" +string)blah3"; +auto utf32_multi = UR"blah4(Hello +"utf-32" +multiline +)" +)blah" +string)blah4"; + // Meta strings #include #include "lib.h"