Skip to content

Commit

Permalink
cpp: Fix highlighting of unterminated raw strings
Browse files Browse the repository at this point in the history
PR highlightjs#1897 switched C++ raw strings to use backreferences, however this
breaks source files where raw strings are truncated. Like comments, it
would be preferable to highlight them.

Instead, go back to using separate begin and end regexps, but introduce
an endFilter feature to filter out false positive matches. This
internally works similarly to endSameAsBegin.

See also issue highlightjs#2259.
  • Loading branch information
davidben authored and joshgoebel committed Apr 27, 2020
1 parent 0afd0d3 commit 6cf5ad4
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 26 deletions.
1 change: 1 addition & 0 deletions docs/reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ In this case you can't simply specify the same regexp for ``begin`` and
``end`` (say, ``"\\$[a-z]\\$"``), but you can use ``begin: "\\$[a-z]\\$"``
and ``endSameAsBegin: true``.


.. _lexemes:

lexemes
Expand Down
63 changes: 43 additions & 20 deletions src/highlight.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ https://highlightjs.org/
*/

import deepFreeze from './vendor/deep_freeze';
import Response from './lib/response';
import TokenTreeEmitter from './lib/token_tree';
import * as regex from './lib/regex';
import * as utils from './lib/utils';
Expand Down Expand Up @@ -118,18 +119,6 @@ const HLJS = function(hljs) {
function _highlight(languageName, code, ignoreIllegals, continuation) {
var codeToHighlight = code;

/**
 * Find the mode that `lexeme` terminates, starting from `mode`.
 *
 * Each candidate's `endRe` is checked against `lexeme` with
 * `regex.startsWith`. On a hit, the result is the candidate itself, or the
 * ancestor reached by following `parent` links while `endsParent` is set.
 * On a miss, the search moves to the candidate's parent only when the
 * candidate has `endsWithParent` set.
 *
 * @param {Object} mode - mode to start the search from
 * @param {string} lexeme - text tested against each candidate's `endRe`
 * @returns {Object|undefined} the mode being ended, or undefined if none is
 */
function endOfMode(mode, lexeme) {
  for (let candidate = mode; ; candidate = candidate.parent) {
    if (regex.startsWith(candidate.endRe, lexeme)) {
      // climb past every mode that also closes its parent
      let closing = candidate;
      while (closing.endsParent && closing.parent) {
        closing = closing.parent;
      }
      return closing;
    }
    if (!candidate.endsWithParent) {
      return undefined;
    }
  }
}

function keywordData(mode, match) {
var matchText = language.case_insensitive ? match[0].toLowerCase() : match[0];
return Object.prototype.hasOwnProperty.call(mode.keywords, matchText) && mode.keywords[matchText];
Expand Down Expand Up @@ -206,7 +195,33 @@ const HLJS = function(hljs) {
if (mode.className) {
emitter.openNode(mode.className);
}
top = Object.create(mode, { parent: { value: top } });
top = Object.create(mode, {parent: {value: top}});
return top;
}

/**
 * Find the mode that is ended by the text in `matchPlusRemainder`.
 *
 * `mode.endRe` is checked against `matchPlusRemainder` with
 * `regex.startsWith`. When it matches, an optional `before:end` callback on
 * the mode may veto the match by setting `ignore` on the Response it is
 * given. An accepted match ends `mode`, or the ancestor reached by following
 * `parent` links while `endsParent` is set.
 *
 * NOTE(review): `match` is not a parameter of this function; it is resolved
 * from the enclosing scope. Confirm that it holds the current end match
 * whenever this function runs.
 *
 * @param {Object} mode - mode to test
 * @param {string} matchPlusRemainder - text tested against `mode.endRe`
 * @returns {Object|undefined} the mode being ended, or undefined if none is
 */
function endOfMode(mode, matchPlusRemainder) {
  const endReMatches = regex.startsWith(mode.endRe, matchPlusRemainder);

  let vetoed = false;
  if (endReMatches && mode["before:end"]) {
    const resp = new Response(mode);
    mode["before:end"](match, resp);
    vetoed = Boolean(resp.ignore);
  }

  if (endReMatches && !vetoed) {
    // climb past every mode that also closes its parent
    let closing = mode;
    while (closing.endsParent && closing.parent) {
      closing = closing.parent;
    }
    return closing;
  }

  // even if before:end fires an `ignore` it's still possible
  // that we might trigger the end node because of a parent mode
  if (!mode.endsWithParent) {
    return undefined;
  }
  return endOfMode(mode.parent, matchPlusRemainder);
}

function doIgnore(lexeme) {
Expand All @@ -226,12 +241,15 @@ const HLJS = function(hljs) {
function doBeginMatch(match) {
var lexeme = match[0];
var new_mode = match.rule;

if (new_mode.__onBegin) {
const res = new_mode.__onBegin(match) || {};
if (res.ignoreMatch) {
return doIgnore(lexeme);
}
var mode;

let resp = new Response(new_mode);
// first internal before callbacks, then the public ones
let beforeCallbacks = [new_mode.__beforeBegin, new_mode["before:begin"]];
for (let cb of beforeCallbacks) {
if (!cb) continue;
cb(match, resp);
if (resp.ignore) return doIgnore(lexeme);
}

if (new_mode && new_mode.endSameAsBegin) {
Expand All @@ -249,13 +267,18 @@ const HLJS = function(hljs) {
mode_buffer = lexeme;
}
}
startNewMode(new_mode);
mode = startNewMode(new_mode);
if (mode["after:begin"]) {
let resp = new Response(mode);
mode["after:begin"](match, resp);
}
return new_mode.returnBegin ? 0 : lexeme.length;
}

function doEndMatch(match) {
var lexeme = match[0];
var matchPlusRemainder = codeToHighlight.substr(match.index);

var end_mode = endOfMode(top, matchPlusRemainder);
if (!end_mode) { return NO_MATCH; }

Expand Down
7 changes: 6 additions & 1 deletion src/languages/c-like.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@ export default function(hljs) {
begin: '(u8?|U|L)?\'(' + CHARACTER_ESCAPES + "|.)", end: '\'',
illegal: '.'
},
{ begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\((?:.|\n)*?\)\1"/ }
{
begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\(/,
end: /\)([^()\\ ]{0,16})"/,
'after:begin': (m, resp) => { resp.data.heredoc = m[1]; },
'before:end': function(m, resp) { if (resp.data.heredoc !== m[1]) resp.ignoreMatch(); }
}
]
};

Expand Down
11 changes: 6 additions & 5 deletions src/lib/mode_compiler.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ export function compileLanguage(language) {
// eslint-disable-next-line no-undefined
const i = match.findIndex((el, i) => i > 0 && el !== undefined);
const matchData = this.matchIndexes[i];
match.splice(0, i); // // trim off the extra matches

return Object.assign(match, matchData);
}
Expand Down Expand Up @@ -158,11 +159,11 @@ export function compileLanguage(language) {
}

// TODO: We need negative look-behind support to do this properly
function skipIfhasPrecedingOrTrailingDot(match) {
function skipIfhasPrecedingOrTrailingDot(match, resp) {
const before = match.input[match.index - 1];
const after = match.input[match.index + match[0].length];
if (before === "." || after === ".") {
return { ignoreMatch: true };
resp.ignoreMatch();
}
}

Expand Down Expand Up @@ -200,8 +201,8 @@ export function compileLanguage(language) {
if (mode.compiled) return;
mode.compiled = true;

// __onBegin is considered private API, internal use only
mode.__onBegin = null;
// __beforeBegin is considered private API, internal use only
mode.__beforeBegin = null;

mode.keywords = mode.keywords || mode.beginKeywords;
if (mode.keywords) {
Expand All @@ -218,7 +219,7 @@ export function compileLanguage(language) {
// doesn't allow spaces in keywords anyways and we still check for the boundary
// first
mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)';
mode.__onBegin = skipIfhasPrecedingOrTrailingDot;
mode.__beforeBegin = skipIfhasPrecedingOrTrailingDot;
}
if (!mode.begin)
mode.begin = /\B|\b/;
Expand Down
11 changes: 11 additions & 0 deletions src/lib/response.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/**
 * Handle passed to mode callbacks so they can share per-mode state and
 * ask for the current match to be ignored.
 */
export default class Response {
  /**
   * @param {Object} mode - mode whose `data` object is exposed as `this.data`;
   *   an empty `data` object is created on the mode when it has none
   */
  constructor(mode) {
    if (mode.data === undefined) {
      mode.data = {};
    }
    // Same object as mode.data, so values written by one callback are
    // visible to a later Response built for the same mode.
    this.data = mode.data;
    // Start from an explicit boolean so `ignore` is never undefined.
    this.ignore = false;
  }

  /** Flag the current match as one that should be ignored. */
  ignoreMatch() {
    this.ignore = true;
  }
}
3 changes: 3 additions & 0 deletions test/markup/cpp/truncated-block-comment.expect.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<span class="hljs-comment">/*
Truncated block comment
</span>
2 changes: 2 additions & 0 deletions test/markup/cpp/truncated-block-comment.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/*
Truncated block comment
5 changes: 5 additions & 0 deletions test/markup/cpp/truncated-raw-string.expect.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<span class="hljs-string">R"foo(
Truncated raw string
)nope"
Still not completed.
</span>
4 changes: 4 additions & 0 deletions test/markup/cpp/truncated-raw-string.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
R"foo(
Truncated raw string
)nope"
Still not completed.

0 comments on commit 6cf5ad4

Please sign in to comment.