Skip to content

Commit

Permalink
Fix source positions for inlines
Browse files Browse the repository at this point in the history
Applied patch from commonmark/cmark#298
  • Loading branch information
Sandra Tatarevićová committed Apr 8, 2020
1 parent beecac3 commit 8e1a5b8
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[submodule "ext/commonmarker/cmark-upstream"]
path = ext/commonmarker/cmark-upstream
url = https://github.com/github/cmark-gfm.git
url = https://github.com/orchitech/cmark-gfm.git
ignore = dirty
72 changes: 70 additions & 2 deletions ext/commonmarker/blocks.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,37 @@ static void add_line(cmark_node *node, cmark_chunk *ch, cmark_parser *parser) {
cmark_strbuf_putc(&node->content, ' ');
}
}

// If inserting the initial line to the node...
if (node->content.size == 0
// OR the node is a code block...
|| node->type == CMARK_NODE_CODE_BLOCK
// OR the node is a HTML block.
|| node->type == CMARK_NODE_HTML_BLOCK) {

// Then do not insert the leading trivia.
cmark_strbuf_put(&node->content, ch->data + parser->offset,
ch->len - parser->offset);
} else {
// Special case for maintaining the source position of block quotes
// as they can be lazy (i.e. the block quote marker can be omitted).
//
// The simple solution is to replace any block quote markers (">")
// present in the leading trivia with whitespace.
//
// Note: Using `parser->offset` and not `parser->first_nonspace`
// because the latter encompasses the former with the addition of
// whitespace (which we are not interested in).
assert(parser->offset <= parser->first_nonspace);
for (int i = 0; i < parser->offset; i++) {
if (peek_at(ch, i) == '>')
ch->data[i] = ' ';
}

// Otherwise, do not remove leading trivia for appends (i.e. lines
// other than the first).
cmark_strbuf_put(&node->content, ch->data, ch->len);
}
}

static void remove_trailing_blank_lines(cmark_strbuf *ln) {
Expand Down Expand Up @@ -266,6 +295,12 @@ static bool resolve_reference_link_definitions(

chunk.data += pos;
chunk.len -= pos;

// Leading whitespace was not stripped when the line was added, so skip it here.
while (cmark_isspace(peek_at(&chunk, 0))) {
chunk.data += 1;
chunk.len -= 1;
}
}
cmark_strbuf_drop(node_content, (node_content->size - chunk.len));
return !is_blank(&b->content, 0);
Expand All @@ -283,13 +318,33 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks
b->flags &= ~CMARK_NODE__OPEN;

if (parser->curline.size == 0) {
if (S_type(b) == CMARK_NODE_THEMATIC_BREAK) {
// Already been "finalized".
return parent;
}

if (S_type(b) == CMARK_NODE_HEADING && !b->as.heading.setext) {
parser->last_line_length += b->end_column;
}

if ((S_type(b) == CMARK_NODE_ITEM || S_type(b) == CMARK_NODE_LIST)
&& b->last_child) {
b->end_line = b->last_child->end_line;
b->end_column = b->last_child->end_column;

if (S_type(b) == CMARK_NODE_ITEM && b->parent) {
// The finalization order is not deterministic...
b->parent->end_line = b->end_line;
b->parent->end_column = b->end_column;
}
} else if (parser->curline.size == 0) {
// end of input - line number has not been incremented
b->end_line = parser->line_number;
b->end_column = parser->last_line_length;
} else if (S_type(b) == CMARK_NODE_DOCUMENT ||
(S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) ||
(S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext)) {
(S_type(b) == CMARK_NODE_HTML_BLOCK
&& b->end_line == b->start_line && b->end_column == 0)) {
b->end_line = parser->line_number;
b->end_column = parser->curline.size;
if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\n')
Expand Down Expand Up @@ -1181,6 +1236,10 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
// it's only now that we know the line is not part of a setext heading:
*container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
parser->first_nonspace + 1);
// A thematic break can only be on a single line, so we can set the
// end source position here.
(*container)->end_line = parser->line_number;
(*container)->end_column = input->len - 1;
S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
} else if (!indented &&
parser->options & CMARK_OPT_FOOTNOTES &&
Expand Down Expand Up @@ -1342,6 +1401,12 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
} else { // not a lazy continuation
// Finalize any blocks that were not matched and set cur to container:
while (parser->current != last_matched_container) {
if (S_type(parser->current) == CMARK_NODE_HTML_BLOCK) {
// Edge case: Closing an HTML block without a matching end condition.
parser->current->end_line = parser->line_number - 1;
parser->current->end_column = parser->last_line_length;
}

parser->current = finalize(parser, parser->current);
assert(parser->current != NULL);
}
Expand Down Expand Up @@ -1392,7 +1457,10 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
} else if (accepts_lines(S_type(container))) {
if (S_type(container) == CMARK_NODE_HEADING &&
container->as.heading.setext == false) {
bufsize_t original_len = input->len;
chop_trailing_hashtags(input);
// Subtract one to exclude the trailing newline.
container->end_column += original_len - input->len - 1;
}
S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
false);
Expand Down
5 changes: 5 additions & 0 deletions ext/commonmarker/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,11 @@ void cmark_strbuf_trim(cmark_strbuf *buf) {
cmark_strbuf_rtrim(buf);
}

// Removes `len` bytes starting at `start_offset` from `buf`, shifting the
// bytes that follow the removed range down to close the gap.
//
// The range is clamped to the buffer's current contents: a negative or
// out-of-range `start_offset`, or a non-positive `len`, is a no-op, and a
// `len` that runs past the end removes only up to the end. Without the clamp
// the byte count handed to memmove() could go negative and wrap around to a
// huge unsigned size.
//
// After a removal the byte at the new end is set to NUL so the contents can
// still be read as a C string. That index was inside the old contents, so the
// write is always in bounds.
void cmark_strbuf_remove(cmark_strbuf *buf, bufsize_t start_offset, bufsize_t len) {
  if (start_offset < 0 || start_offset >= buf->size || len <= 0) {
    return;
  }

  // Never remove more than what actually follows start_offset.
  if (len > buf->size - start_offset) {
    len = buf->size - start_offset;
  }

  bufsize_t tail_len = buf->size - (start_offset + len);
  if (tail_len > 0) {
    // Source and destination overlap, hence memmove rather than memcpy.
    memmove(buf->ptr + start_offset, buf->ptr + start_offset + len, tail_len);
  }

  buf->size -= len;
  buf->ptr[buf->size] = '\0';
}

// Destructively modify string, collapsing consecutive
// space and newline characters into a single space.
void cmark_strbuf_normalize_whitespace(cmark_strbuf *s) {
Expand Down
10 changes: 10 additions & 0 deletions ext/commonmarker/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,16 @@ void cmark_strbuf_rtrim(cmark_strbuf *buf);
CMARK_GFM_EXPORT
void cmark_strbuf_trim(cmark_strbuf *buf);

/**
Removes `len` characters starting at `start_offset`, shifting any characters
after the removed range down to close the gap and shrinking the buffer's size
by `len`.
The range must lie within the buffer's current contents, i.e.
`start_offset + len` must not exceed the buffer's size.
@param buf The string buffer to modify in place.
@param start_offset The zero-based offset of the first character to remove.
@param len The number of characters to remove.
*/
CMARK_GFM_EXPORT
void cmark_strbuf_remove(cmark_strbuf *buf, bufsize_t start_offset, bufsize_t len);

CMARK_GFM_EXPORT
void cmark_strbuf_normalize_whitespace(cmark_strbuf *s);

Expand Down
2 changes: 1 addition & 1 deletion ext/commonmarker/cmark-upstream
76 changes: 70 additions & 6 deletions ext/commonmarker/inlines.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ static CMARK_INLINE cmark_node *make_autolink(subject *subj,
link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
link->as.link.title = cmark_chunk_literal("");
link->start_line = link->end_line = subj->line;
link->start_column = start_column + 1;
link->end_column = end_column + 1;
link->start_column = subj->column_offset + subj->block_offset + start_column + 1;
link->end_column = subj->column_offset + subj->block_offset + end_column + 1;
cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
return link;
}
Expand Down Expand Up @@ -325,10 +325,10 @@ static bufsize_t scan_to_closing_backticks(subject *subj,
// spaces, then removing a single leading + trailing space,
// unless the code span consists entirely of space characters.
static void S_normalize_code(cmark_strbuf *s) {
bufsize_t r, w;
bufsize_t r, w, last_char_after_nl;
bool contains_nonspace = false;

for (r = 0, w = 0; r < s->size; ++r) {
for (r = 0, w = 0, last_char_after_nl = 0; r < s->size; ++r) {
switch (s->ptr[r]) {
case '\r':
if (s->ptr[r + 1] != '\n') {
Expand All @@ -337,15 +337,46 @@ static void S_normalize_code(cmark_strbuf *s) {
break;
case '\n':
s->ptr[w++] = ' ';
last_char_after_nl = w;
break;
case ' ':
s->ptr[w++] = s->ptr[r];
break;
default:
if (last_char_after_nl) {
// Remove leading whitespace.
bufsize_t remove_len = r - last_char_after_nl;

if (remove_len) {
cmark_strbuf_remove(s, last_char_after_nl, remove_len);
w -= remove_len;
r -= remove_len;
}

last_char_after_nl = 0;
}

s->ptr[w++] = s->ptr[r];
}
if (s->ptr[r] != ' ') {
contains_nonspace = true;
}
}

if (last_char_after_nl) {
// Remove leading whitespace. Only reach here if the closing backquote
// delimiter is on its own line.
bufsize_t remove_len = r - last_char_after_nl;

if (remove_len) {
cmark_strbuf_remove(s, last_char_after_nl, remove_len);
w -= remove_len;
r -= remove_len;
}

last_char_after_nl = 0;
}

// begins and ends with space?
if (contains_nonspace &&
s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') {
Expand All @@ -361,13 +392,15 @@ static void S_normalize_code(cmark_strbuf *s) {
// Parse backtick code section or raw backticks, return an inline.
// Assumes that the subject has a backtick at the current position.
static cmark_node *handle_backticks(subject *subj, int options) {
// Save the current source position in case of need to rewind.
bufsize_t subjpos = subj->pos;
cmark_chunk openticks = take_while(subj, isbacktick);
bufsize_t startpos = subj->pos;
bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);

if (endpos == 0) { // not found
subj->pos = startpos; // rewind
return make_str(subj, subj->pos, subj->pos, openticks);
return make_str(subj, subjpos, subjpos, openticks);
} else {
cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);

Expand Down Expand Up @@ -797,6 +830,10 @@ static cmark_node *handle_backslash(cmark_parser *parser, subject *subj) {
advance(subj);
return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
} else if (!is_eof(subj) && skip_line_end(subj)) {
// Adjust the subject source position state.
++subj->line;
subj->column_offset = -subj->pos;

return make_linebreak(subj->mem);
} else {
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
Expand Down Expand Up @@ -1163,7 +1200,8 @@ static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
inl->as.link.url = url;
inl->as.link.title = title;
inl->start_line = inl->end_line = subj->line;
inl->start_line = opener->inl_text->start_line;
inl->end_line = subj->line;
inl->start_column = opener->inl_text->start_column;
inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
cmark_node_insert_before(opener->inl_text, inl);
Expand Down Expand Up @@ -1304,10 +1342,21 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
cmark_chunk contents;
unsigned char c;
bufsize_t startpos, endpos;
int saved_block_offset = subj->block_offset;

c = peek_char(subj);
if (c == 0) {
return 0;
}

// If NOT the subject's initial line...
if (subj->column_offset != 0) {
// Reset the block offset. The line's leading trivia was not trimmed,
// so the source position will be computed appropriately without the
// block offset.
subj->block_offset = 0;
}

switch (c) {
case '\r':
case '\n':
Expand Down Expand Up @@ -1370,12 +1419,27 @@ static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent,
cmark_chunk_rtrim(&contents);
}

// If not the initial line (in the subject) AND at the beginning of another line.
if (subj->column_offset != 0 && startpos + subj->column_offset == 0) {
// Trim leading whitespace.
bufsize_t before_trim = contents.len;
cmark_chunk_ltrim(&contents);

if (contents.len == 0)
break; // The contents were only whitespaces.

// Update the start source position.
startpos += before_trim - contents.len;
}

new_inl = make_str(subj, startpos, endpos - 1, contents);
}
if (new_inl != NULL) {
cmark_node_append_child(parent, new_inl);
}

subj->block_offset = saved_block_offset;

return 1;
}

Expand Down

0 comments on commit 8e1a5b8

Please sign in to comment.