From 573174ea3e04504bc39839c3f341a1b2b998d10e Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 8 Nov 2019 18:20:45 -0500 Subject: [PATCH] cpp: Fix highlighting of unterminated raw strings PR #1897 switched C++ raw strings to use backreferences; however, this breaks source files where raw strings are truncated. Like comments, it would be preferable to highlight them. Instead, go back to using separate begin and end regexps, but introduce an endFilter feature to filter out false positive matches. This internally works similarly to endSameAsBegin. See also issue #2259. --- docs/reference.rst | 22 ++++++++++++++++++- src/highlight.js | 14 +++++++----- src/languages/c-like.js | 11 +++++++++- .../cpp/truncated-block-comment.expect.txt | 3 +++ test/markup/cpp/truncated-block-comment.txt | 2 ++ .../cpp/truncated-raw-string.expect.txt | 5 +++++ test/markup/cpp/truncated-raw-string.txt | 4 ++++ 7 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 test/markup/cpp/truncated-block-comment.expect.txt create mode 100644 test/markup/cpp/truncated-block-comment.txt create mode 100644 test/markup/cpp/truncated-raw-string.expect.txt create mode 100644 test/markup/cpp/truncated-raw-string.txt diff --git a/docs/reference.rst b/docs/reference.rst index 8d5b0b80e6..1f6a690468 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -190,7 +190,7 @@ endSameAsBegin Acts as ``end`` matching exactly the same string that was found by the corresponding ``begin`` regexp. -For example, in PostgreSQL string constants can uee "dollar quotes", +For example, in PostgreSQL string constants can use "dollar quotes", consisting of a dollar sign, an optional tag of zero or more characters, and another dollar sign. String constant must be ended with the same construct using the same tag. 
It is possible to nest dollar-quoted string @@ -208,6 +208,26 @@ In this case you can't simply specify the same regexp for ``begin`` and ``end`` (say, ``"\\$[a-z]\\$"``), but you can use ``begin: "\\$[a-z]\\$"`` and ``endSameAsBegin: true``. +.. _endFilter: + +endFilter +^^^^^^^^^ + +**type**: function + +Filters ``end`` matches to implement end rules that cannot be expressed as a +standalone regular expression. + +This should be a function which takes two string parameters, the string that +matched the ``begin`` regexp and the string that matched the ``end`` regexp. It +should return true to end the mode and false otherwise. + +For example, C++11 raw string constants use syntax like ``R"tag(.....)tag"``, +where ``tag`` is any zero to sixteen character string that must be repeated at +the end. This could be matched with a single regexp containing backreferences, +but truncated raw strings would not highlight. Instead, ``endFilter`` can be +used to reject ``)tag"`` delimiters which do not match the starting value. + .. 
_lexemes: lexemes diff --git a/src/highlight.js b/src/highlight.js index 9ce4bd97c3..8e2b1c7607 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -120,15 +120,19 @@ const HLJS = function(hljs) { function _highlight(languageName, code, ignore_illegals, continuation) { var codeToHighlight = code; - function endOfMode(mode, lexeme) { - if (regex.startsWith(mode.endRe, lexeme)) { + function endOfMode(mode, matchPlusRemainder, lexeme) { + var modeEnded = regex.startsWith(mode.endRe, matchPlusRemainder); + if (modeEnded && mode.endFilter) { + modeEnded = mode.endFilter(mode.beginValue, lexeme); + } + if (modeEnded) { while (mode.endsParent && mode.parent) { mode = mode.parent; } return mode; } if (mode.endsWithParent) { - return endOfMode(mode.parent, lexeme); + return endOfMode(mode.parent, matchPlusRemainder, lexeme); } } @@ -210,7 +214,7 @@ const HLJS = function(hljs) { if (mode.className) { emitter.openNode(mode.className); } - top = Object.create(mode, {parent: {value: top}}); + top = Object.create(mode, {parent: {value: top}, beginValue: {value: lexeme}}); } function doIgnore(lexeme) { @@ -259,7 +263,7 @@ const HLJS = function(hljs) { function doEndMatch(match) { var lexeme = match[0]; var matchPlusRemainder = codeToHighlight.substr(match.index); - var end_mode = endOfMode(top, matchPlusRemainder); + var end_mode = endOfMode(top, matchPlusRemainder, lexeme); if (!end_mode) { return; } var origin = top; diff --git a/src/languages/c-like.js b/src/languages/c-like.js index 90e23073ee..2ba297e7cf 100644 --- a/src/languages/c-like.js +++ b/src/languages/c-like.js @@ -44,7 +44,16 @@ export default function(hljs) { begin: '(u8?|U|L)?\'(' + CHARACTER_ESCAPES + "|.)", end: '\'', illegal: '.' 
}, - { begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\((?:.|\n)*?\)\1"/ } + { + begin: /(?:u8?|U|L)?R"[^()\\ ]{0,16}\(/, + end: /\)[^()\\ ]{0,16}"/, + endFilter: function(begin, end) { + var quote = begin.indexOf('"'); + var beginDelimiter = begin.substring(quote + 1, begin.length - 1); + var endDelimiter = end.substring(1, end.length - 1); + return beginDelimiter == endDelimiter; + }, + } ] }; diff --git a/test/markup/cpp/truncated-block-comment.expect.txt b/test/markup/cpp/truncated-block-comment.expect.txt new file mode 100644 index 0000000000..a2f5ce048a --- /dev/null +++ b/test/markup/cpp/truncated-block-comment.expect.txt @@ -0,0 +1,3 @@ +/* +Truncated block comment + diff --git a/test/markup/cpp/truncated-block-comment.txt b/test/markup/cpp/truncated-block-comment.txt new file mode 100644 index 0000000000..b266bf0806 --- /dev/null +++ b/test/markup/cpp/truncated-block-comment.txt @@ -0,0 +1,2 @@ +/* +Truncated block comment diff --git a/test/markup/cpp/truncated-raw-string.expect.txt b/test/markup/cpp/truncated-raw-string.expect.txt new file mode 100644 index 0000000000..8d133e8bae --- /dev/null +++ b/test/markup/cpp/truncated-raw-string.expect.txt @@ -0,0 +1,5 @@ +R"foo( +Truncated raw string +)nope" +Still not completed. + diff --git a/test/markup/cpp/truncated-raw-string.txt b/test/markup/cpp/truncated-raw-string.txt new file mode 100644 index 0000000000..b012c82bfe --- /dev/null +++ b/test/markup/cpp/truncated-raw-string.txt @@ -0,0 +1,4 @@ +R"foo( +Truncated raw string +)nope" +Still not completed.