Question: Position of a token in the source string #2134
Comments
You are correct, we do not currently log the string positions of the tokens. You may be able to get something to work with the |
@nidoro We always appreciate PRs 😁👍 |
I just started using the library, and my knowledge of its inner workings is too limited to make a pull request. But I made some changes that seem to be working. I'll explain what I did and would really appreciate your feedback to make sure I'm doing things correctly. I did some testing and things are working 99% of the time, but I'm still missing something. I've modified the lexer so that it returns the position of each token in the source. So each token returned by the
blockTokens(src, tokens, top, at)
inline(tokens, at)
inlineTokens(src, tokens, at, inLink, inRawBlock)
function lex(src) {
src = src.replace(/\r\n|\r/g, '\n').replace(/\t/g, ' ');
let at = {line: 0, column: 0, index: 0};
this.blockTokens(src, this.tokens, true, at);
at = {line: 0, column: 0, index: 0};
this.inline(this.tokens, at);
return this.tokens;
}
function copyAt(at) {
return {line: at.line, column: at.column, index: at.index};
}
// Advances the 'at' iterator by 'count' characters.
function advance(src, at, count) {
for (let i = 0; i < count; ++i) {
let c = src[i];
if (c == '\n') {
++at.line;
at.column = 0;
} else {
++at.column;
}
++at.index;
}
}
// Eats the token that starts 'src', meaning it sets the token
// start and end positions, advances the 'at' iterator to skip
// the token and returns the remaining string.
function eatToken(src, token, at) {
token.start = this.copyAt(at);
this.advance(src, at, token.raw.length-1);
token.end = this.copyAt(at);
this.advance(src[token.raw.length-1], at, 1);
return src.substring(token.raw.length);
}
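So, for example, a token for the heading "# Hi\n" at the very start of the document would come back with positions like this (illustrative values derived from the code above, not actual marked output):
{
  type: "heading",
  raw: "# Hi\n",
  start: { line: 0, column: 0, index: 0 },
  end:   { line: 0, column: 4, index: 4 }  // position of the last raw character
}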
I think the
Click to see Lexer changes (I've only pasted the "Block Lexer" section of the file, which contains all the changes)
/**
* Block Lexer
*/
var Lexer_1 = /*#__PURE__*/function () {
function Lexer(options) {
this.tokens = [];
this.tokens.links = Object.create(null);
this.options = options || defaults$3;
this.options.tokenizer = this.options.tokenizer || new Tokenizer$1();
this.tokenizer = this.options.tokenizer;
this.tokenizer.options = this.options;
var rules = {
block: block.normal,
inline: inline.normal
};
if (this.options.pedantic) {
rules.block = block.pedantic;
rules.inline = inline.pedantic;
} else if (this.options.gfm) {
rules.block = block.gfm;
if (this.options.breaks) {
rules.inline = inline.breaks;
} else {
rules.inline = inline.gfm;
}
}
this.tokenizer.rules = rules;
}
/**
* Expose Rules
*/
/**
* Static Lex Method
*/
Lexer.lex = function lex(src, options) {
var lexer = new Lexer(options);
return lexer.lex(src);
}
/**
* Static Lex Inline Method
*/
;
Lexer.lexInline = function lexInline(src, options) {
var lexer = new Lexer(options);
return lexer.inlineTokens(src);
}
/**
* Preprocessing
*/
;
var _proto = Lexer.prototype;
_proto.lex = function lex(src) {
src = src.replace(/\r\n|\r/g, '\n').replace(/\t/g, ' ');
let at = {line: 0, column: 0, index: 0};
this.blockTokens(src, this.tokens, true, at);
at = {line: 0, column: 0, index: 0};
this.inline(this.tokens, at);
return this.tokens;
}
/**
* Lexing
*/
;
_proto.copyAt = function copyAt(at) {
return {line: at.line, column: at.column, index: at.index};
}
_proto.advance = function advance(src, at, count) {
for (let i = 0; i < count; ++i) {
let c = src[i];
if (c == '\n') {
++at.line;
at.column = 0;
} else {
++at.column;
}
++at.index;
}
}
_proto.eatToken = function eatToken(src, token, at) {
token.start = this.copyAt(at);
this.advance(src, at, token.raw.length-1);
token.end = this.copyAt(at);
this.advance(src[token.raw.length-1], at, 1);
return src.substring(token.raw.length);
}
_proto.blockTokens = function blockTokens(src, tokens, top, at) {
var _this = this;
if (tokens === void 0) {
tokens = [];
}
if (top === void 0) {
top = true;
}
if (at === void 0) {
at = {line: 0, column: 0, index: 0};
}
if (this.options.pedantic) {
src = src.replace(/^ +$/gm, '');
}
var token, i, l, lastToken, cutSrc, lastParagraphClipped;
while (src) {
if (this.options.extensions && this.options.extensions.block && this.options.extensions.block.some(function (extTokenizer) {
if (token = extTokenizer.call(_this, src, tokens)) {
src = _this.eatToken(src, token, at);
tokens.push(token);
return true;
}
return false;
})) {
continue;
} // newline
if (token = this.tokenizer.space(src)) {
src = this.eatToken(src, token, at);
if (token.type) {
tokens.push(token);
}
continue;
} // code
if (token = this.tokenizer.code(src)) {
src = this.eatToken(src, token, at);
lastToken = tokens[tokens.length - 1]; // An indented code block cannot interrupt a paragraph.
if (lastToken && lastToken.type === 'paragraph') {
lastToken.raw += '\n' + token.raw;
lastToken.text += '\n' + token.text;
} else {
tokens.push(token);
}
continue;
} // fences
if (token = this.tokenizer.fences(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // heading
if (token = this.tokenizer.heading(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // table no leading pipe (gfm)
if (token = this.tokenizer.nptable(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // hr
if (token = this.tokenizer.hr(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // blockquote
if (token = this.tokenizer.blockquote(src)) {
src = this.eatToken(src, token, at);
token.tokens = this.blockTokens(token.text, [], top, this.copyAt(at));
tokens.push(token);
continue;
} // list
if (token = this.tokenizer.list(src)) {
src = this.eatToken(src, token, at);
l = token.items.length;
for (i = 0; i < l; i++) {
token.items[i].tokens = this.blockTokens(token.items[i].text, [], false, this.copyAt(at));
}
tokens.push(token);
continue;
} // html
if (token = this.tokenizer.html(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // def
if (top && (token = this.tokenizer.def(src))) {
src = this.eatToken(src, token, at);
if (!this.tokens.links[token.tag]) {
this.tokens.links[token.tag] = {
href: token.href,
title: token.title
};
}
continue;
} // table (gfm)
if (token = this.tokenizer.table(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // lheading
if (token = this.tokenizer.lheading(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // top-level paragraph
// prevent paragraph consuming extensions by clipping 'src' to extension start
cutSrc = src;
if (this.options.extensions && this.options.extensions.startBlock) {
(function () {
var startIndex = Infinity;
var tempSrc = src.slice(1);
var tempStart = void 0;
_this.options.extensions.startBlock.forEach(function (getStartIndex) {
tempStart = getStartIndex.call(this, tempSrc);
if (typeof tempStart === 'number' && tempStart >= 0) {
startIndex = Math.min(startIndex, tempStart);
}
});
if (startIndex < Infinity && startIndex >= 0) {
cutSrc = src.substring(0, startIndex + 1);
}
})();
}
if (top && (token = this.tokenizer.paragraph(cutSrc))) {
lastToken = tokens[tokens.length - 1];
if (lastParagraphClipped && lastToken.type === 'paragraph') {
lastToken.raw += '\n' + token.raw;
lastToken.text += '\n' + token.text;
} else {
tokens.push(token);
}
lastParagraphClipped = cutSrc.length !== src.length;
src = this.eatToken(src, token, at);
continue;
} // text
if (token = this.tokenizer.text(src)) {
src = this.eatToken(src, token, at);
lastToken = tokens[tokens.length - 1];
if (lastToken && lastToken.type === 'text') {
lastToken.raw += '\n' + token.raw;
lastToken.text += '\n' + token.text;
} else {
tokens.push(token);
}
continue;
}
if (src) {
var errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
if (this.options.silent) {
console.error(errMsg);
break;
} else {
throw new Error(errMsg);
}
}
}
return tokens;
};
_proto.inline = function inline(tokens, at) {
var i, j, k, l2, row, token;
var l = tokens.length;
for (i = 0; i < l; i++) {
token = tokens[i];
switch (token.type) {
case 'paragraph':
case 'text':
case 'heading':
{
token.tokens = [];
this.inlineTokens(token.text, token.tokens, this.copyAt(token.start));
break;
}
case 'table':
{
token.tokens = {
header: [],
cells: []
}; // header
l2 = token.header.length;
for (j = 0; j < l2; j++) {
token.tokens.header[j] = [];
this.inlineTokens(token.header[j], token.tokens.header[j], at);
} // cells
l2 = token.cells.length;
for (j = 0; j < l2; j++) {
row = token.cells[j];
token.tokens.cells[j] = [];
for (k = 0; k < row.length; k++) {
token.tokens.cells[j][k] = [];
this.inlineTokens(row[k], token.tokens.cells[j][k], at);
}
}
break;
}
case 'blockquote':
{
this.inline(token.tokens, at);
break;
}
case 'list':
{
l2 = token.items.length;
for (j = 0; j < l2; j++) {
this.inline(token.items[j].tokens, at);
}
break;
}
}
}
return tokens;
}
/**
* Lexing/Compiling
*/
;
_proto.inlineTokens = function inlineTokens(src, tokens, at, inLink, inRawBlock) {
var _this2 = this;
if (at === void 0) {
at = {line: 0, column: 0, index: 0};
}
if (tokens === void 0) {
tokens = [];
}
if (inLink === void 0) {
inLink = false;
}
if (inRawBlock === void 0) {
inRawBlock = false;
}
var token, lastToken, cutSrc; // String with links masked to avoid interference with em and strong
var maskedSrc = src;
var match;
var keepPrevChar, prevChar; // Mask out reflinks
if (this.tokens.links) {
var links = Object.keys(this.tokens.links);
if (links.length > 0) {
while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
}
}
}
} // Mask out other blocks
while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
} // Mask out escaped em & strong delimiters
while ((match = this.tokenizer.rules.inline.escapedEmSt.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.escapedEmSt.lastIndex);
}
while (src) {
if (!keepPrevChar) {
prevChar = '';
}
keepPrevChar = false; // extensions
if (this.options.extensions && this.options.extensions.inline && this.options.extensions.inline.some(function (extTokenizer) {
if (token = extTokenizer.call(_this2, src, tokens)) {
src = _this2.eatToken(src, token, at);
tokens.push(token);
return true;
}
return false;
})) {
continue;
} // escape
if (token = this.tokenizer.escape(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // tag
if (token = this.tokenizer.tag(src, inLink, inRawBlock)) {
src = this.eatToken(src, token, at);
inLink = token.inLink;
inRawBlock = token.inRawBlock;
lastToken = tokens[tokens.length - 1];
if (lastToken && token.type === 'text' && lastToken.type === 'text') {
lastToken.raw += token.raw;
lastToken.text += token.text;
} else {
tokens.push(token);
}
continue;
} // link
if (token = this.tokenizer.link(src)) {
src = this.eatToken(src, token, at);
if (token.type === 'link') {
token.tokens = this.inlineTokens(token.text, [], this.copyAt(at), true, inRawBlock);
}
tokens.push(token);
continue;
} // reflink, nolink
if (token = this.tokenizer.reflink(src, this.tokens.links)) {
src = this.eatToken(src, token, at);
lastToken = tokens[tokens.length - 1];
if (token.type === 'link') {
token.tokens = this.inlineTokens(token.text, [], this.copyAt(at), true, inRawBlock);
tokens.push(token);
} else if (lastToken && token.type === 'text' && lastToken.type === 'text') {
lastToken.raw += token.raw;
lastToken.text += token.text;
} else {
tokens.push(token);
}
continue;
} // em & strong
if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
src = this.eatToken(src, token, at);
token.tokens = this.inlineTokens(token.text, [], this.copyAt(at), inLink, inRawBlock);
tokens.push(token);
continue;
} // code
if (token = this.tokenizer.codespan(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // br
if (token = this.tokenizer.br(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // del (gfm)
if (token = this.tokenizer.del(src)) {
src = this.eatToken(src, token, at);
token.tokens = this.inlineTokens(token.text, [], this.copyAt(at), inLink, inRawBlock);
tokens.push(token);
continue;
} // autolink
if (token = this.tokenizer.autolink(src, mangle)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // url (gfm)
if (!inLink && (token = this.tokenizer.url(src, mangle))) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // text
// prevent inlineText consuming extensions by clipping 'src' to extension start
cutSrc = src;
if (this.options.extensions && this.options.extensions.startInline) {
(function () {
var startIndex = Infinity;
var tempSrc = src.slice(1);
var tempStart = void 0;
_this2.options.extensions.startInline.forEach(function (getStartIndex) {
tempStart = getStartIndex.call(this, tempSrc);
if (typeof tempStart === 'number' && tempStart >= 0) {
startIndex = Math.min(startIndex, tempStart);
}
});
if (startIndex < Infinity && startIndex >= 0) {
cutSrc = src.substring(0, startIndex + 1);
}
})();
}
if (token = this.tokenizer.inlineText(cutSrc, inRawBlock, smartypants)) {
src = this.eatToken(src, token, at);
if (token.raw.slice(-1) !== '_') {
// Track prevChar before string of ____ started
prevChar = token.raw.slice(-1);
}
keepPrevChar = true;
lastToken = tokens[tokens.length - 1];
if (lastToken && lastToken.type === 'text') {
lastToken.raw += token.raw;
lastToken.text += token.text;
} else {
tokens.push(token);
}
continue;
}
if (src) {
var errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
if (this.options.silent) {
console.error(errMsg);
break;
} else {
throw new Error(errMsg);
}
}
}
return tokens;
};
_createClass(Lexer, null, [{
key: "rules",
get: function get() {
return {
block: block,
inline: inline
};
}
}]);
return Lexer;
}();
Like I said, everything seems to be working 99% of the time, but I've noticed an incorrect result for the following markdown source, and I suspect there are other cases that would generate incorrect results:
I appreciate any help on this. Thank you for the library! |
I think this is going to be much harder (nearly impossible) because of the line |
also this looks like it is going to slow marked down a lot checking every character for |
You raise valid points, but depending on the use case, they may or may not be of great importance. For instance, in my use case, we only feed
Anyway, I do understand that this is not a highly demanded feature, but I do think it is an improvement to the library. Maybe you can make it optional, so the default behavior stays the same, but you have a
I've fixed some bugs in my previous code. It is still not perfect, but it is better. The only remaining problem (hopefully) is with nested styles, like emphasis inside lists. But I think it is just a matter of time to get it to 100%.
Click to show code
/**
* Block Lexer
*/
var Lexer_1 = /*#__PURE__*/function () {
function Lexer(options) {
this.tokens = [];
this.tokens.links = Object.create(null);
this.options = options || defaults$3;
this.options.tokenizer = this.options.tokenizer || new Tokenizer$1();
this.tokenizer = this.options.tokenizer;
this.tokenizer.options = this.options;
var rules = {
block: block.normal,
inline: inline.normal
};
if (this.options.pedantic) {
rules.block = block.pedantic;
rules.inline = inline.pedantic;
} else if (this.options.gfm) {
rules.block = block.gfm;
if (this.options.breaks) {
rules.inline = inline.breaks;
} else {
rules.inline = inline.gfm;
}
}
this.tokenizer.rules = rules;
}
/**
* Expose Rules
*/
/**
* Static Lex Method
*/
Lexer.lex = function lex(src, options) {
var lexer = new Lexer(options);
return lexer.lex(src);
}
/**
* Static Lex Inline Method
*/
;
Lexer.lexInline = function lexInline(src, options) {
var lexer = new Lexer(options);
return lexer.inlineTokens(src);
}
/**
* Preprocessing
*/
;
var _proto = Lexer.prototype;
_proto.lex = function lex(src) {
src = src.replace(/\r\n|\r/g, '\n').replace(/\t/g, ' ');
let at = {line: 0, column: 0, index: 0};
this.blockTokens(src, this.tokens, true, at);
at = {line: 0, column: 0, index: 0};
this.inline(this.tokens, at);
return this.tokens;
}
/**
* Lexing
*/
;
_proto.copyAt = function copyAt(at) {
return {line: at.line, column: at.column, index: at.index};
}
_proto.advance = function advance(src, at, count) {
for (let i = 0; i < count; ++i) {
let c = src[i];
if (c == '\n') {
++at.line;
at.column = 0;
} else {
++at.column;
}
++at.index;
}
}
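// Like the earlier version, eatToken records the token's start and end
// positions and advances 'at' past the raw text. It now also records
// 'textStart': the position where token.text begins inside token.raw,
// which is used as the starting point when lexing nested block and
// inline tokens.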
_proto.eatToken = function eatToken(src, token, at) {
let textStartOffset = src.indexOf(token.text);
token.textStart = this.copyAt(at);
this.advance(src, token.textStart, textStartOffset);
token.start = this.copyAt(at);
this.advance(src, at, token.raw.length-1);
token.end = this.copyAt(at);
this.advance(src[token.raw.length-1], at, 1);
return src.substring(token.raw.length);
}
_proto.blockTokens = function blockTokens(src, tokens, top, at) {
var _this = this;
if (tokens === void 0) {
tokens = [];
}
if (top === void 0) {
top = true;
}
if (at === void 0) {
at = {line: 0, column: 0, index: 0};
}
if (this.options.pedantic) {
src = src.replace(/^ +$/gm, '');
}
var token, i, l, lastToken, cutSrc, lastParagraphClipped;
while (src) {
if (this.options.extensions && this.options.extensions.block && this.options.extensions.block.some(function (extTokenizer) {
if (token = extTokenizer.call(_this, src, tokens)) {
src = _this.eatToken(src, token, at);
tokens.push(token);
return true;
}
return false;
})) {
continue;
} // newline
if (token = this.tokenizer.space(src)) {
src = this.eatToken(src, token, at);
if (token.type) {
tokens.push(token);
}
continue;
} // code
if (token = this.tokenizer.code(src)) {
src = this.eatToken(src, token, at);
lastToken = tokens[tokens.length - 1]; // An indented code block cannot interrupt a paragraph.
if (lastToken && lastToken.type === 'paragraph') {
lastToken.raw += '\n' + token.raw;
lastToken.text += '\n' + token.text;
} else {
tokens.push(token);
}
continue;
} // fences
if (token = this.tokenizer.fences(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // heading
if (token = this.tokenizer.heading(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // table no leading pipe (gfm)
if (token = this.tokenizer.nptable(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // hr
if (token = this.tokenizer.hr(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // blockquote
if (token = this.tokenizer.blockquote(src)) {
src = this.eatToken(src, token, at);
token.tokens = this.blockTokens(token.text, [], top, this.copyAt(token.textStart));
tokens.push(token);
continue;
} // list
if (token = this.tokenizer.list(src)) {
src = this.eatToken(src, token, at);
l = token.items.length;
for (i = 0; i < l; i++) {
token.items[i].tokens = this.blockTokens(token.items[i].text, [], false, this.copyAt(token.textStart));
}
tokens.push(token);
continue;
} // html
if (token = this.tokenizer.html(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // def
if (top && (token = this.tokenizer.def(src))) {
src = this.eatToken(src, token, at);
if (!this.tokens.links[token.tag]) {
this.tokens.links[token.tag] = {
href: token.href,
title: token.title
};
}
continue;
} // table (gfm)
if (token = this.tokenizer.table(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // lheading
if (token = this.tokenizer.lheading(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // top-level paragraph
// prevent paragraph consuming extensions by clipping 'src' to extension start
cutSrc = src;
if (this.options.extensions && this.options.extensions.startBlock) {
(function () {
var startIndex = Infinity;
var tempSrc = src.slice(1);
var tempStart = void 0;
_this.options.extensions.startBlock.forEach(function (getStartIndex) {
tempStart = getStartIndex.call(this, tempSrc);
if (typeof tempStart === 'number' && tempStart >= 0) {
startIndex = Math.min(startIndex, tempStart);
}
});
if (startIndex < Infinity && startIndex >= 0) {
cutSrc = src.substring(0, startIndex + 1);
}
})();
}
if (top && (token = this.tokenizer.paragraph(cutSrc))) {
lastToken = tokens[tokens.length - 1];
if (lastParagraphClipped && lastToken.type === 'paragraph') {
lastToken.raw += '\n' + token.raw;
lastToken.text += '\n' + token.text;
} else {
tokens.push(token);
}
lastParagraphClipped = cutSrc.length !== src.length;
src = this.eatToken(src, token, at);
continue;
} // text
if (token = this.tokenizer.text(src)) {
src = this.eatToken(src, token, at);
lastToken = tokens[tokens.length - 1];
if (lastToken && lastToken.type === 'text') {
lastToken.raw += '\n' + token.raw;
lastToken.text += '\n' + token.text;
} else {
tokens.push(token);
}
continue;
}
if (src) {
var errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
if (this.options.silent) {
console.error(errMsg);
break;
} else {
throw new Error(errMsg);
}
}
}
return tokens;
};
_proto.inline = function inline(tokens, at) {
var i, j, k, l2, row, token;
var l = tokens.length;
for (i = 0; i < l; i++) {
token = tokens[i];
switch (token.type) {
case 'paragraph':
case 'text':
case 'heading':
{
token.tokens = [];
this.inlineTokens(token.text, token.tokens, this.copyAt(token.textStart));
break;
}
case 'table':
{
token.tokens = {
header: [],
cells: []
}; // header
l2 = token.header.length;
for (j = 0; j < l2; j++) {
token.tokens.header[j] = [];
this.inlineTokens(token.header[j], token.tokens.header[j], at);
} // cells
l2 = token.cells.length;
for (j = 0; j < l2; j++) {
row = token.cells[j];
token.tokens.cells[j] = [];
for (k = 0; k < row.length; k++) {
token.tokens.cells[j][k] = [];
this.inlineTokens(row[k], token.tokens.cells[j][k], at);
}
}
break;
}
case 'blockquote':
{
this.inline(token.tokens, at);
break;
}
case 'list':
{
l2 = token.items.length;
for (j = 0; j < l2; j++) {
this.inline(token.items[j].tokens, at);
}
break;
}
}
}
return tokens;
}
/**
* Lexing/Compiling
*/
;
_proto.inlineTokens = function inlineTokens(src, tokens, at, inLink, inRawBlock) {
var _this2 = this;
if (at === void 0) {
at = {line: 0, column: 0, index: 0};
}
if (tokens === void 0) {
tokens = [];
}
if (inLink === void 0) {
inLink = false;
}
if (inRawBlock === void 0) {
inRawBlock = false;
}
var token, lastToken, cutSrc; // String with links masked to avoid interference with em and strong
var maskedSrc = src;
var match;
var keepPrevChar, prevChar; // Mask out reflinks
if (this.tokens.links) {
var links = Object.keys(this.tokens.links);
if (links.length > 0) {
while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
}
}
}
} // Mask out other blocks
while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
} // Mask out escaped em & strong delimiters
while ((match = this.tokenizer.rules.inline.escapedEmSt.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.escapedEmSt.lastIndex);
}
while (src) {
if (!keepPrevChar) {
prevChar = '';
}
keepPrevChar = false; // extensions
if (this.options.extensions && this.options.extensions.inline && this.options.extensions.inline.some(function (extTokenizer) {
if (token = extTokenizer.call(_this2, src, tokens)) {
src = _this2.eatToken(src, token, at);
tokens.push(token);
return true;
}
return false;
})) {
continue;
} // escape
if (token = this.tokenizer.escape(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // tag
if (token = this.tokenizer.tag(src, inLink, inRawBlock)) {
src = this.eatToken(src, token, at);
inLink = token.inLink;
inRawBlock = token.inRawBlock;
lastToken = tokens[tokens.length - 1];
if (lastToken && token.type === 'text' && lastToken.type === 'text') {
lastToken.raw += token.raw;
lastToken.text += token.text;
} else {
tokens.push(token);
}
continue;
} // link
if (token = this.tokenizer.link(src)) {
src = this.eatToken(src, token, at);
if (token.type === 'link') {
token.tokens = this.inlineTokens(token.text, [], this.copyAt(token.textStart), true, inRawBlock);
}
tokens.push(token);
continue;
} // reflink, nolink
if (token = this.tokenizer.reflink(src, this.tokens.links)) {
src = this.eatToken(src, token, at);
lastToken = tokens[tokens.length - 1];
if (token.type === 'link') {
token.tokens = this.inlineTokens(token.text, [], this.copyAt(token.textStart), true, inRawBlock);
tokens.push(token);
} else if (lastToken && token.type === 'text' && lastToken.type === 'text') {
lastToken.raw += token.raw;
lastToken.text += token.text;
} else {
tokens.push(token);
}
continue;
} // em & strong
if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
src = this.eatToken(src, token, at);
token.tokens = this.inlineTokens(token.text, [], this.copyAt(token.textStart), inLink, inRawBlock);
tokens.push(token);
continue;
} // code
if (token = this.tokenizer.codespan(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // br
if (token = this.tokenizer.br(src)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // del (gfm)
if (token = this.tokenizer.del(src)) {
src = this.eatToken(src, token, at);
token.tokens = this.inlineTokens(token.text, [], this.copyAt(token.textStart), inLink, inRawBlock);
tokens.push(token);
continue;
} // autolink
if (token = this.tokenizer.autolink(src, mangle)) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // url (gfm)
if (!inLink && (token = this.tokenizer.url(src, mangle))) {
src = this.eatToken(src, token, at);
tokens.push(token);
continue;
} // text
// prevent inlineText consuming extensions by clipping 'src' to extension start
cutSrc = src;
if (this.options.extensions && this.options.extensions.startInline) {
(function () {
var startIndex = Infinity;
var tempSrc = src.slice(1);
var tempStart = void 0;
_this2.options.extensions.startInline.forEach(function (getStartIndex) {
tempStart = getStartIndex.call(this, tempSrc);
if (typeof tempStart === 'number' && tempStart >= 0) {
startIndex = Math.min(startIndex, tempStart);
}
});
if (startIndex < Infinity && startIndex >= 0) {
cutSrc = src.substring(0, startIndex + 1);
}
})();
}
if (token = this.tokenizer.inlineText(cutSrc, inRawBlock, smartypants)) {
src = this.eatToken(src, token, at);
if (token.raw.slice(-1) !== '_') {
// Track prevChar before string of ____ started
prevChar = token.raw.slice(-1);
}
keepPrevChar = true;
lastToken = tokens[tokens.length - 1];
if (lastToken && lastToken.type === 'text') {
lastToken.raw += token.raw;
lastToken.text += token.text;
} else {
tokens.push(token);
}
continue;
}
if (src) {
var errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
if (this.options.silent) {
console.error(errMsg);
break;
} else {
throw new Error(errMsg);
}
}
}
return tokens;
};
_createClass(Lexer, null, [{
key: "rules",
get: function get() {
return {
block: block,
inline: inline
};
}
}]);
return Lexer;
}(); |
Another way to make it work is to let the user specify how many spaces a tab corresponds to. |
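For illustration, a minimal sketch of that idea (tabSize is a hypothetical option, not something marked exposes, and markdownSource stands for the caller's input): expand tabs to a caller-chosen width before lexing, so the caller knows exactly how the normalized columns relate to the original source.
// Hypothetical pre-processing step: the caller picks the tab width,
// so later column arithmetic on the normalized text is predictable.
function normalizeTabs(src, tabSize = 4) {
  return src.replace(/\r\n|\r/g, '\n').replace(/\t/g, ' '.repeat(tabSize));
}

const tokens = marked.lexer(normalizeTabs(markdownSource, 2));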
marked does save the |
Correct me if I'm wrong, but I think the |
This is simply merging two adjacent tokens. Occasionally we have to break a paragraph in half to check if a block of code or something else is beginning at that point. If it turns out that the second token is just the rest of a paragraph then we merge them back together. That's all that is happening; it should end up equivalent to the user input. |
In fact this is another good reason to do this in |
I see. I guess there is no advantage in changing the Lexer directly then. Thanks for the clarification. I'll probably still use the solution I'm working on, as it is nearly done, but if I ever feel the need to use the
Again, thank you for the library! |
Hello again, I abandoned the idea of modifying the lexer. Now I'm trying to use the
marked.lexer("> quote\n\nparagraph");
returns these tokens:
[
{ type: "blockquote", raw: "> blockquote\n", … }
{ type: "paragraph", raw: "paragraph", … }
]
If I tried to calculate the starting line of the paragraph (or any of the following tokens) using the previous token as a reference, the line would be 2 (if we count from 1). But in the original input, the paragraph is at line 3, like so:
> quote

paragraph
I'm working around this behavior by adding a line when I encounter a blockquote, but I don't know if that's reliable. So I have two questions:
|
Welp, there is definitely no way
marked.lexer("> quote\n# heading");
marked.lexer("> quote\n\n# heading");
return the exact same tokens:
[
{ type: "blockquote", raw: "> blockquote\n", text: "blockquote\n", … }
{ type: "heading", raw: "# heading", depth: 1, … }
]
So it is impossible to decide if the user entered one line or two lines after a blockquote. Is there a fix or workaround for this? |
So yes, it is a bug if the raw does not actually match the text that was consumed. |
Not yet, but if you want to create a PR we would be very appreciative 👍 It looks like the
Lines 143 to 149 in e7b04a7
|
I have a similar need for a project. It's a notes app that uses Marked to render, with some interactivity in the html (like checking a checkbox) having a direct result in the source text (adding the x between [ ]). It would be very helpful for this kind of thing for Marked to expose the position of the rendered element in the source text. I'll have a look at the walkTokens approach sometime soon. I'm only interested in the offset for the token, but it should be easy to go from there to line/column values by counting the newlines up to the offset. |
So I've tried the walkTokens approach, but it went south pretty quickly. My approach was to keep a running sum of the length of the raw field in each token. It sort-of works for block level elements as long as the source text doesn't contain tabs or Windows line-endings, but even then I had to account for quirks in the parser (for instance the last newline of a paragraph is never shown in any raw field if the paragraph is immediately followed by another block-level element). For inline items it all falls down. You'd need to know their offset from their containing block element, but there's no clean way to figure out how much of the raw input was consumed by the block-level element. For instance an H1 heading can validly start with "# " or " # ". That influences the offset of its inline elements, but you'd need to re-parse the raw string to figure that out. I don't want to account for all current and future possibilities there, so I think this is a dead end. For reference, this is the walkTokens function I was testing with (WHICH GIVES INVALID RESULTS, you have been warned):
The app object is my global state. app.tokenoffset needs to be initialized to 0 before each render. |
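The function itself did not survive the copy here; below is a minimal sketch of the running-sum approach described above (app and app.tokenoffset come from the description, the rest is a guess, and as the author warns it gives invalid results):
// Naive running offset: app.tokenoffset is reset to 0 before each render,
// every token records the current offset, and the offset then advances by
// the token's raw length. Nested tokens repeat their parent's text, which
// is one of the reasons this approach falls apart.
const walkTokens = (token) => {
  token._offset = app.tokenoffset;
  app.tokenoffset += token.raw.length;
};

marked.use({ walkTokens });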
Looks like |
Now my question is: how do we render extra information from the token into the HTML, like the _start and _end shown above? The renderer doesn't pass the token to the rendering functions, so no extra token data can be rendered to the DOM. For an interactive application, we can't get token info from the DOM elements users act on, like clicking a - [ ] task checkbox to toggle it. |
To get a checkbox change event you can just use JavaScript:
document.querySelector("input[type=checkbox]").addEventListener("change", () => {...}) |
The event is simple. But then, how do we know which task item should be updated? For example:
when the user clicks the checkbox of the second Task1, which text should we update and toggle? By searching the text from |
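For illustration, one way around searching by text (a sketch, not part of marked's API; it assumes the rendered checkboxes appear in the same order as the task list items in the token tree, and src is the markdown source being rendered) is to pair them by position:
// Collect task list items in token order, then pair them with the rendered
// checkboxes by index instead of by text.
function collectTaskItems(tokens, out = []) {
  for (const token of tokens) {
    if (token.type === 'list_item' && token.task) out.push(token);
    if (token.items) collectTaskItems(token.items, out);        // lists
    else if (token.tokens) collectTaskItems(token.tokens, out); // nested blocks
  }
  return out;
}

const taskItems = collectTaskItems(marked.lexer(src));

document.querySelectorAll('input[type=checkbox]').forEach((box, i) => {
  box.addEventListener('change', () => {
    const item = taskItems[i]; // the i-th task item in source order
    // item.raw still contains "[ ]" or "[x]"; combined with a tracked
    // source offset it identifies which text to toggle.
    console.log('toggle item', i, item && item.raw);
  });
});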
marked is only meant for converting markdown to html. For anything else you will need other tools. |
Well, I think marked.js is great. More interactive things can be done through extensions. As for the title of this issue, "Position of a token in the source string", I have achieved that through marked.js's extension mechanism. However, we are just talking about "converting markdown to html" here. The HTML generated from the tokens could be made more extensible. So, just like @bartnv mentioned, he is building a notes app that uses Marked to render; interactive features can be added in an extension. I am doing a similar thing, and marked.js works great for me. If it provided more extensibility between the token tree and the HTML rendering, it would be even better. Details: https://marked.js.org/using_pro#block-level-renderer-methods Thanks for the reply |
I'm looking for this feature too. I think there is certainly some demand for it, for example passing the parsed markdown tokens (with line and character info) to a text editor, e.g. VSCode, so one can use them to develop an extension. AFAIK there is no library that is fully capable of doing this. I will try to take a look at the code and develop the feature when I get some free time. But I'd love to know if you have a fork with the completed patch, in case you cannot simply pull-request the work to the repository. |
I have been maintaining a patch for line numbers: https://github.com/9001/copyparty/blob/hovudstraum/scripts/deps-docker/marked-ln.patch
|
It seems like, to get this working, we need to have something like the token type (block or inline) available in the walkTokens function. |
I've revisited this with version 4.0.12. The off-by-one errors in the token.raw are indeed gone, thanks for that UziTech. The only thing I would need from core markedjs is for each block level token in walkTokens to have the offset to the first child (inline) token within its raw string. So for a paragraph that would be the number of spaces and '#' characters at its start. For a list_item that would be leading spaces, '*' or '-' characters and a possible checkbox. Etc. I've looked briefly at how this could be accomplished. It would require setting the 'd' flag on the block level regexes to get the 'indices' property on the result that specifies the offsets of the submatches within the match. The tokenizer can then add this offset to the token object. If you'd be willing to consider this then I can prepare a PR. |
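For reference, a minimal sketch of what the 'd' flag provides (the heading rule below is a simplified stand-in, not marked's actual regex):
// With the 'd' flag, exec() populates match.indices with the [start, end]
// offsets of each capture group, so the tokenizer could report where the
// inline text begins within the raw match.
const heading = /^ {0,3}(#{1,6}) +([^\n]*?)(?:\n+|$)/d;

const match = heading.exec('  ## Hello *world*\n');
if (match) {
  const [textStart, textEnd] = match.indices[2]; // offsets of the text capture
  console.log(textStart, match[0].slice(textStart, textEnd)); // 5 "Hello *world*"
}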
@bartnv that would be great if you could create a PR. The one thing I would want to watch out for is bringing down the speed of marked. You can run |
I have forked marked, applied the patch and published it to npm as
Looking forward to this becoming a standard feature of marked. |
Late to the party, but we've been trying to use marked in a wiki editor but unfortunately, though it's a great library, the mangling done to the src prior to, and during, lexing makes re-adding position tracking a negative force on performance. This is hard to undo at this point without a significant rewrite, but if done well, position tracking should be a positive (not negative) force on performance. Removing tabs for example is not necessary if the logic for identifying tokens can assimilate tabs as well. This would open marked up to many use cases that cannot be addressed currently (syntax highlighting of the source, link replacement across other MD documents when a target document is moved, etc.). Basically right now marked is trapped in a one-way translative role until this gets addressed. We'd PR it, but it's such a big "touch" to the code that it seems like it actually needs an in-project epic to migrate there. Is that a possibility? |
...or maybe promote it as a markdown renderer (its clear focus area: ideal for rendering into other language spaces) instead of as a general parser (for us it took time to find out it's not really usable as a parser except as a step towards rendering - a "true" parser suggests focus on the source document, not the output - we assumed it could do both and now are faced with switching to another lib). Might save others like us from investing time with marked for a purpose it declares but ultimately isn't designed to address. It's clearly excellent for rendering, but definitely not for source document analysis of any depth. This is meant as positive feedback - better to bill it as a car, not a pickup truck, and avoid surprises. |
Any way to improve marked is ok with me. 😁👍 I'm curious, at a high level, what would be needed to make marked work the way you explain?
If you would like to create a PR that updates docs/readme I would be ok with that as well. I do, however, disagree that marked is a markdown renderer. Marked does not render markdown, it parses markdown to render it as HTML. We do parse the markdown into tokens. Just because it doesn't have the information you want doesn't make it any less of a parser, but perhaps we could do a better job explaining what information is available. |
Ran into #3440 while attempting to expand on #2134 (comment). We've got a lot of markdown, and the tabs issue I'm pretty sure is the only thing standing in my way from having a working line counter for CI reporting. Once tab positions are resolved, the
// There's more efficient ways, but this can work in a pinch for Windows and *nix.
const getLocation = (contents, offset) => {
let source = contents.slice(0, offset)
let line = 1
let next = -1
let prev = -1
while ((next = source.indexOf("\n", prev + 1)) >= 0) {
line++
prev = next
}
return {offset, line, column: offset - prev}
}
const getSpanLocations = (contents, token, startOffset) => {
const endOffset = startOffset + token.raw.length
return {
start: getLocation(contents, startOffset),
end: getLocation(contents, endOffset),
}
} |
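A possible way to drive these helpers, keeping a running offset over the top-level tokens (it assumes token.raw covers exactly the source that was consumed, which is the open question in this thread):
const tokens = marked.lexer(contents);
let offset = 0;
for (const token of tokens) {
  const loc = getSpanLocations(contents, token, offset);
  console.log(token.type, `${loc.start.line}:${loc.start.column}`, '-', `${loc.end.line}:${loc.end.column}`);
  offset += token.raw.length; // advance past this token's raw text
}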
I'm working on a project (a syntax highlighter for an editor) which requires me to have access to the position of a token within the source string. After scanning through the lexer and parser documentation I didn't find a way to do so. Ideally, for my use case, the tokens returned by the
lex(...)
function would contain the character position (line number and column number) of the start and end of the token (or the token's raw size, which I think is already available). Is there already a way to know the position of each token? If not, consider this a feature proposal :) I'm sure it is an easy thing to add.