Skip to content

Commit

Permalink
enh(cpp): Improve highlighting of unterminated raw strings
Browse files Browse the repository at this point in the history
PR #1897 switched C++ raw strings to use backreferences, however this
breaks source files where raw strings are truncated. Like comments, it
would be preferable to highlight them.

- Add `on:begin` and `on:end` to allow more granular matching when
  the end match is dynamic and based on a part of the begin match
- This deprecates the `endSameAsBegin` attribute. That attribute was
  a very specific way to solve this problem, but now we have a much
  more general solution in these added callbacks.

Also related: #2259.

Co-authored-by: Josh Goebel <me@joshgoebel.com>
  • Loading branch information
davidben and joshgoebel committed Apr 27, 2020
1 parent 0afd0d3 commit 58d9113
Show file tree
Hide file tree
Showing 9 changed files with 82 additions and 27 deletions.
1 change: 1 addition & 0 deletions docs/reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ In this case you can't simply specify the same regexp for ``begin`` and
``end`` (say, ``"\\$[a-z]\\$"``), but you can use ``begin: "\\$[a-z]\\$"``
and ``endSameAsBegin: true``.


.. _lexemes:

lexemes
Expand Down
65 changes: 44 additions & 21 deletions src/highlight.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ https://highlightjs.org/
*/

import deepFreeze from './vendor/deep_freeze';
import Response from './lib/response';
import TokenTreeEmitter from './lib/token_tree';
import * as regex from './lib/regex';
import * as utils from './lib/utils';
Expand Down Expand Up @@ -118,18 +119,6 @@ const HLJS = function(hljs) {
function _highlight(languageName, code, ignoreIllegals, continuation) {
var codeToHighlight = code;

/**
 * Determine which mode, if any, is closed by the given lexeme.
 *
 * The lexeme is tested against `mode.endRe` via `regex.startsWith`. When it
 * does not match and the mode is flagged `endsWithParent`, the same question
 * is asked of the parent mode.
 *
 * @param {object} mode - mode to test; reads `endRe`, `endsParent`,
 *   `endsWithParent` and `parent`
 * @param {string} lexeme - text to test against the end expression
 * @returns {object|undefined} the mode being closed, or `undefined` when
 *   nothing ends here
 */
function endOfMode(mode, lexeme) {
  if (!regex.startsWith(mode.endRe, lexeme)) {
    // this mode does not end here, but an `endsWithParent` mode also
    // terminates wherever its parent would
    return mode.endsWithParent ? endOfMode(mode.parent, lexeme) : undefined;
  }

  // a mode flagged `endsParent` closes its parent along with itself, so walk
  // up for as long as the flag is set and a parent exists
  let closing = mode;
  while (closing.endsParent && closing.parent) {
    closing = closing.parent;
  }
  return closing;
}

function keywordData(mode, match) {
var matchText = language.case_insensitive ? match[0].toLowerCase() : match[0];
return Object.prototype.hasOwnProperty.call(mode.keywords, matchText) && mode.keywords[matchText];
Expand Down Expand Up @@ -206,7 +195,33 @@ const HLJS = function(hljs) {
if (mode.className) {
emitter.openNode(mode.className);
}
top = Object.create(mode, { parent: { value: top } });
top = Object.create(mode, {parent: {value: top}});
return top;
}

/**
 * Determine which mode, if any, is closed at the current match.
 *
 * `matchPlusRemainder` is tested against `mode.endRe` via `regex.startsWith`.
 * If that succeeds and the mode defines a `before:end` callback, the callback
 * is given the match and a fresh `Response`; setting `ignore` on the response
 * vetoes the end. A mode flagged `endsWithParent` that does not end here
 * (including when vetoed) defers to its parent mode.
 *
 * @param {object} mode - mode to test; reads `endRe`, `before:end`,
 *   `endsParent`, `endsWithParent` and `parent`
 * @param {object} match - match object handed to the `before:end` callback
 * @param {string} matchPlusRemainder - text tested against the end expression
 * @returns {object|undefined} the mode being closed, or `undefined` when
 *   nothing ends here
 */
function endOfMode(mode, match, matchPlusRemainder) {
  const endsHere = regex.startsWith(mode.endRe, matchPlusRemainder);

  let vetoed = false;
  if (endsHere && mode["before:end"]) {
    const response = new Response(mode);
    // invoked as a method of `mode`, exactly as a direct property call would
    mode["before:end"](match, response);
    vetoed = Boolean(response.ignore);
  }

  if (endsHere && !vetoed) {
    // a mode flagged `endsParent` closes its parent along with itself
    let closing = mode;
    while (closing.endsParent && closing.parent) {
      closing = closing.parent;
    }
    return closing;
  }

  // even if before:end fires an `ignore` it's still possible
  // that we might trigger the end node because of a parent mode
  if (mode.endsWithParent) {
    return endOfMode(mode.parent, match, matchPlusRemainder);
  }
  return undefined;
}

function doIgnore(lexeme) {
Expand All @@ -226,12 +241,15 @@ const HLJS = function(hljs) {
function doBeginMatch(match) {
var lexeme = match[0];
var new_mode = match.rule;

if (new_mode.__onBegin) {
const res = new_mode.__onBegin(match) || {};
if (res.ignoreMatch) {
return doIgnore(lexeme);
}
var mode;

let resp = new Response(new_mode);
// first internal before callbacks, then the public ones
let beforeCallbacks = [new_mode.__beforeBegin, new_mode["before:begin"]];
for (let cb of beforeCallbacks) {
if (!cb) continue;
cb(match, resp);
if (resp.ignore) return doIgnore(lexeme);
}

if (new_mode && new_mode.endSameAsBegin) {
Expand All @@ -249,14 +267,19 @@ const HLJS = function(hljs) {
mode_buffer = lexeme;
}
}
startNewMode(new_mode);
mode = startNewMode(new_mode);
if (mode["after:begin"]) {
let resp = new Response(mode);
mode["after:begin"](match, resp);
}
return new_mode.returnBegin ? 0 : lexeme.length;
}

function doEndMatch(match) {
var lexeme = match[0];
var matchPlusRemainder = codeToHighlight.substr(match.index);
var end_mode = endOfMode(top, matchPlusRemainder);

var end_mode = endOfMode(top, match, matchPlusRemainder);
if (!end_mode) { return NO_MATCH; }

var origin = top;
Expand Down
7 changes: 6 additions & 1 deletion src/languages/c-like.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@ export default function(hljs) {
begin: '(u8?|U|L)?\'(' + CHARACTER_ESCAPES + "|.)", end: '\'',
illegal: '.'
},
{ begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\((?:.|\n)*?\)\1"/ }
{
begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\(/,
end: /\)([^()\\ ]{0,16})"/,
'after:begin': (m, resp) => { resp.data.heredoc = m[1]; },
'before:end': function(m, resp) { if (resp.data.heredoc !== m[1]) resp.ignoreMatch(); }
}
]
};

Expand Down
11 changes: 6 additions & 5 deletions src/lib/mode_compiler.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ export function compileLanguage(language) {
// eslint-disable-next-line no-undefined
const i = match.findIndex((el, i) => i > 0 && el !== undefined);
const matchData = this.matchIndexes[i];
match.splice(0, i); // // trim off the extra matches

return Object.assign(match, matchData);
}
Expand Down Expand Up @@ -158,11 +159,11 @@ export function compileLanguage(language) {
}

// TODO: We need negative look-behind support to do this properly
function skipIfhasPrecedingOrTrailingDot(match) {
function skipIfhasPrecedingOrTrailingDot(match, resp) {
const before = match.input[match.index - 1];
const after = match.input[match.index + match[0].length];
if (before === "." || after === ".") {
return { ignoreMatch: true };
resp.ignoreMatch();
}
}

Expand Down Expand Up @@ -200,8 +201,8 @@ export function compileLanguage(language) {
if (mode.compiled) return;
mode.compiled = true;

// __onBegin is considered private API, internal use only
mode.__onBegin = null;
// __beforeBegin is considered private API, internal use only
mode.__beforeBegin = null;

mode.keywords = mode.keywords || mode.beginKeywords;
if (mode.keywords) {
Expand All @@ -218,7 +219,7 @@ export function compileLanguage(language) {
// doesn't allow spaces in keywords anyways and we still check for the boundary
// first
mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)';
mode.__onBegin = skipIfhasPrecedingOrTrailingDot;
mode.__beforeBegin = skipIfhasPrecedingOrTrailingDot;
}
if (!mode.begin)
mode.begin = /\B|\b/;
Expand Down
11 changes: 11 additions & 0 deletions src/lib/response.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/**
 * Object handed to mode callbacks so they can share state and veto a match.
 *
 * `data` is the mode's own `data` object (created on the mode if it does not
 * exist yet), so values stored by one callback are visible to later callbacks
 * that receive a Response built from the same mode.
 */
export default class Response {
  /**
   * @param {object} mode - mode whose `data` store is exposed; gains a
   *   `data` property if it has none
   */
  constructor(mode) {
    // eslint-disable-next-line no-undefined
    if (mode.data === undefined) {
      mode.data = {};
    }
    this.data = mode.data;
    // always a boolean: false until a callback calls ignoreMatch()
    this.ignore = false;
  }

  /** Flag the current match as one that should be ignored. */
  ignoreMatch() {
    this.ignore = true;
  }
}
3 changes: 3 additions & 0 deletions test/markup/cpp/truncated-block-comment.expect.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<span class="hljs-comment">/*
Truncated block comment
</span>
2 changes: 2 additions & 0 deletions test/markup/cpp/truncated-block-comment.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/*
Truncated block comment
5 changes: 5 additions & 0 deletions test/markup/cpp/truncated-raw-string.expect.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<span class="hljs-string">R"foo(
Truncated raw string
)nope"
Still not completed.
</span>
4 changes: 4 additions & 0 deletions test/markup/cpp/truncated-raw-string.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
R"foo(
Truncated raw string
)nope"
Still not completed.

0 comments on commit 58d9113

Please sign in to comment.