From 15e117a6c449ed7ff1a1e082c7da813b40faa39b Mon Sep 17 00:00:00 2001 From: Jan Jurzitza Date: Wed, 8 Apr 2020 11:56:02 +0200 Subject: [PATCH 1/5] add unittest for current comment behavior --- src/dparse/lexer.d | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/src/dparse/lexer.d b/src/dparse/lexer.d index 7212874b..e6b02089 100644 --- a/src/dparse/lexer.d +++ b/src/dparse/lexer.d @@ -2696,3 +2696,70 @@ unittest assert(l.front().type == tok!""); assert(!l.messages.empty); } + +// legacy code using compatibility comment and trailingComment +unittest +{ + import std.conv : to; + import std.exception : enforce; + + static immutable src = `/// this is a module. +// mixed +/// it can do stuff +module foo.bar; + +// hello + +/** + * some doc + * hello + */ +int x; /// very nice + +// TODO: do stuff +void main() { + #line 40 + /// could be better + writeln(":)"); +} + +/// end of file`; + + LexerConfig cf; + StringCache ca = StringCache(16); + + const tokens = getTokensForParser(src, cf, &ca); + + void assertEquals(T)(T a, T b, string what, string file = __FILE__, size_t line = __LINE__) + { + enforce(a == b, "Failed " ~ what ~ " '" ~ a.to!string ~ "' == '" ~ b.to!string ~ "'", file, line); + } + + void test(size_t index, IdType type, string comment, string trailingComment, + string file = __FILE__, size_t line = __LINE__) + { + assertEquals(tokens[index].type, type, "type", file, line); + assertEquals(tokens[index].comment, comment, "comment", file, line); + assertEquals(tokens[index].trailingComment, trailingComment, "trailingComment", file, line); + } + + test(0, tok!"module", "this is a module.\nit can do stuff", ""); + test(1, tok!"identifier", "", ""); + test(2, tok!".", "", ""); + test(3, tok!"identifier", "", ""); + test(4, tok!";", "", ""); + test(5, tok!"int", "some doc\nhello", ""); + test(6, tok!"identifier", "", ""); + test(7, tok!";", "", "very nice"); + test(8, tok!"void", "", ""); + test(9, tok!"identifier", 
"", ""); + test(10, tok!"(", "", ""); + test(11, tok!")", "", ""); + test(12, tok!"{", "", ""); + test(13, tok!"identifier", "could be better", ""); + test(14, tok!"(", "", ""); + test(15, tok!"stringLiteral", "", ""); + test(16, tok!")", "", ""); + test(17, tok!";", "", ""); + test(18, tok!"}", "", ""); +} From afe19b2e93073fb71f2f042f6bb6ab0f0908ee5d Mon Sep 17 00:00:00 2001 From: Jan Jurzitza Date: Wed, 8 Apr 2020 15:48:27 +0200 Subject: [PATCH 2/5] separate comment helpers into dparse.trivia --- src/dparse/lexer.d | 449 +------------------------------------- src/dparse/trivia.d | 509 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 511 insertions(+), 447 deletions(-) create mode 100644 src/dparse/trivia.d diff --git a/src/dparse/lexer.d b/src/dparse/lexer.d index e6b02089..21ee97d3 100644 --- a/src/dparse/lexer.d +++ b/src/dparse/lexer.d @@ -14,6 +14,8 @@ version (D_InlineAsm_X86_64) else version = iasm64NotWindows; } +public import dparse.trivia; + /// Operators private enum operators = [ ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=", @@ -1862,453 +1864,6 @@ if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) return DLexer(range, config, cache); } -/** - * Removes "decoration" such as leading whitespace, leading + and * characters, - * and places the result into the given output range - */ -public void unDecorateComment(T)(string comment, auto ref T outputRange) -if (isOutputRange!(T, string)) -in -{ - assert (comment.length >= 3); -} -do -{ - import std.string : chompPrefix, KeepTerminator, lineSplitter, stripRight; - - string leadingChars; - - enum LineType { none, normal, strange } - LineType prevLineType; - - switch (comment[0 .. 3]) - { - case "///": - foreach (line; lineSplitter!(KeepTerminator.yes)(comment)) - { - if (leadingChars.empty) - { - size_t k = 3; - while (k < line.length && (line[k] == ' ' || line[k] == '\t')) - k++; - leadingChars = line[0 .. 
k]; - } - outputRange.put(line.chompPrefix(leadingChars)); - } - break; - case "/++": - case "/**": - alias CL = MultiLineCommentHelper!(ElementEncodingType!(typeof(comment))); - CL cl = CL(comment); - cl.process(outputRange); - break; - default: - outputRange.put(comment); - } -} - -/// -unittest -{ - import std.array:array, appender; - import std.stdio:stderr; - stderr.writeln("Running unittest for unDecorateComment..."); - - string[] inputs = [ - "/***************\n*******************/", - "/***************\n *\n ******************/", - "/**\n*/", - "/** */", - "/***/", - "/******/", - "/** abcde1 */", - "/// abcde2\n/// abcde2", - "/**\n * stuff1\n */", - "/**\n *\n * stuff2\n */", - "/**\n *\n * stuff3\n *\n */", - "/**\n *\n * stuff4\n *\n*/", - "/**\n * abcde3\n * abcde3 \n */", - "/**\n * abcde4\n *\n * abcde4\n */", - "/**abcde5\n*abcde5\n*/", - "/** abcde6\n * abcde6\n*/", - "/**\n1\n\n\n\n*/", - "/**\r\n1\r\n\r\n\r\n\r\n*/", - "/**\na1\n\na2\n\n*/", - "/**b1\n*b2\n*b3*/", - "/**c1\n *c2\n *c3*/", - "/**d1\n *d2\n *d3\n*/", - "///a\fbc\n///def" - ]; - string[] outputs = [ - "", - "", - "", - "", - "", - "", - "abcde1", - "abcde2\nabcde2", - "stuff1", - "stuff2", - "stuff3", - "stuff4", - "abcde3\n abcde3", - "abcde4\n\nabcde4", - "abcde5\nabcde5", - "abcde6\nabcde6", - "1", - "1", - "a1\n\na2", - "b1\nb2\nb3", - "c1\nc2\nc3", - "d1\nd2\nd3", - "a\fbc\ndef" - ]; - - // tests where * and + are not interchangeable - string[2][] np = - [ - ["/**\n * d1\n d2\n */", "* d1\nd2"], - ["/**\n + d1\n d2\n */", "+ d1\nd2"], - ["/**d1\n\n\n*d2\n*/", "d1\n\n*d2"], - ]; - - assert(inputs.length == outputs.length); - foreach (pair; zip(inputs, outputs)) - { - foreach (b; [true, false]) - { - auto app = appender!string(); - unDecorateComment(b ? 
pair[0] : pair[0].replace("*", "+"), app); - assert(pair[1] == app.data, "[[" ~ pair[0] ~ "]] => [[" ~ app.data ~ "]]"); - } - } - foreach (pair; np) - { - auto app = appender!string(); - unDecorateComment(pair[0], app); - assert(pair[1] == app.data, "[[" ~ pair[0] ~ "]] => [[" ~ app.data ~ "]]"); - } - stderr.writeln("Unittest for unDecorateComment passed."); -} - -/** Gives a line per line view on DDOC comments of type `/++` and `/**` which - * makes easier to remove the decoration and in an almost 100% nogc way. */ -private struct MultiLineCommentHelper(CharType : const(char)) -{ - // this struct is more used as a 'function with nested functions' would. - this() @disable; - this(this) @disable; - auto opAssign(T)(T t) @disable; - -private: - - char[][] lines; - // either lines.length or lines.length-1, depending on if last line only closes - size_t lastLineInBlockPlusOne; - // either '*' or '+' - const(char) commentChar; - // either 0 or 1, depending on if first line only opens - ubyte firstLineInBlock; - - import std.ascii : isWhite; - - void stripIndent() @safe @nogc pure nothrow - { - if (lines.length < 2) - return; - size_t count; - foreach (const j; 0 .. lines[1].length) - if (!(lines[1][j]).isWhite) - { - count = j; - break; - } - if (count < 2) - return; - foreach (ref line; lines[1 .. $]) - { - foreach (const j; 0 .. line.length) - { - if (!(line[j]).isWhite) - break; - if (j == count - 1) - { - line = line[j .. 
$]; - break; - } - } - } - } - - void processFirstLine() @safe @nogc pure nothrow - { - assert(lines.length); - if (lines[0].length > 3) - { - foreach (const i; 1..lines[0].length) - { - if (lines[0][i] == commentChar) - { - if (i < lines[0].length - 2) - continue; - if (i == lines[0].length - 2 && lines[0][i+1] == '/') - { - lines[0][] = ' '; - break; - } - if (i == lines[0].length - 1) - { - lines[0][] = ' '; - break; - } - } - else - { - lines[0][0..i] = ' '; - break; - } - } - } - lines[0][0..3] = " "; - if (lines.length == 1 && - lines[0][$-2] == commentChar && lines[0][$-1] == '/') - { - lines[0][$-2..$] = " "; - } - foreach (const i; 0..lines[0].length) - if (!(lines[0][i].isWhite)) - return; - firstLineInBlock = 1; - } - - void processLastLine() @safe @nogc pure nothrow - { - lastLineInBlockPlusOne = lines.length; - if (lines.length == 1) - return; - size_t closeStartIndex = size_t.max; - foreach (const i; 0..lines[$-1].length) - { - if (lines[$-1][i] == commentChar) - { - if (closeStartIndex == size_t.max) - closeStartIndex = i; - if (i == lines[$-1].length - 2) - { - // see the FIXME note in unDecorate() - lastLineInBlockPlusOne = closeStartIndex == 0 ? lines.length-1 : lines.length; - - lines[$-1][closeStartIndex..$] = ' '; - break; - } - } - else - { - closeStartIndex = size_t.max; - lastLineInBlockPlusOne = lines.length; - } - } - } - - void unDecorate() @safe @nogc pure nothrow - { - if (lines.length == 1 || lines.length == 2 && lines[$-1].length == 0) - return; - bool allDecorated; - static immutable char[2][2] pattern = [[' ', '*'],[' ', '+']]; - const ubyte patternIndex = commentChar == '+'; - // first line is never decorated - const size_t lo = 1; - // although very uncommon, the last line can be decorated e.g in `* lastline */`: - // the first '*' is a deco if all prev lines are also decorated. - // FIXME: `hi` should be set to `lastLineInBlockPlusOne`... 
- const size_t hi = (lines[$-1].length > 1 && - (lines[$-1][0] == commentChar || lines[$-1][0..2] == pattern[patternIndex])) - ? lines.length : lines.length-1; - // deco with a leading white - foreach (const i; lo .. hi) - { - if (lines[i].length < 2) - break; - else if (lines[i][0..2] != pattern[patternIndex]) - break; - else if (i == hi-1) - allDecorated = true; - } - // deco w/o leading white - if (!allDecorated) - foreach (const i; lo .. hi) - { - if (lines[i].length == 0) - break; - if (lines[i][0] != commentChar) - break; - else if (i == hi-1) - allDecorated = true; - } - if (!allDecorated) - return; - - const size_t indexToChange = (lines[lo][0] == commentChar) ? 0 : 1; - foreach (ref line; lines[lo .. hi]) - line[indexToChange] = ' '; - } - - void stripLeft() @safe @nogc pure nothrow - { - foreach (const i; 0 .. lines[0].length) - if (!(lines[0][i]).isWhite) - { - lines[0] = lines[0][i..$]; - break; - } - if (lines.length == 1) - return; - while (true) - { - bool processColumn; - foreach (ref line; lines[1 .. lastLineInBlockPlusOne]) - { - if (line.length == 0) - continue; - if (!(line[0]).isWhite) - return; - processColumn = true; - } - if (!processColumn) - return; - foreach (ref line; lines[1 .. lastLineInBlockPlusOne]) - { - if (line.length == 0) - continue; - line = line[1..$]; - } - } - } - - void stripRight() @safe @nogc pure nothrow - { - foreach (ref line; lines[0 .. 
lines.length]) - { - if (line.length == 0) - continue; - if ((line[$-1]).isWhite) - { - size_t firstWhite = line.length; - while (firstWhite > 0 && (line[firstWhite-1]).isWhite) - firstWhite--; - line = line[0..firstWhite]; - } - } - } - - void run() @safe @nogc pure nothrow - { - stripIndent(); - processFirstLine(); - processLastLine(); - unDecorate(); - stripLeft(); - stripRight(); - } - -public: - - this(CharType[] text) @safe pure nothrow - { - assert(text.length >= 3 && text[0] == '/', - "MultiLineCommentHelper text must start with a comment in form /++ or /**"); - - commentChar = text[1]; - size_t startIndex, i; - Appender!(char[][]) linesApp; - linesApp.reserve(512); - - void storeLine(size_t endIndexPlusOne) - { - static if (isMutable!CharType) - linesApp ~= text[startIndex..endIndexPlusOne]; - else - linesApp ~= text[startIndex..endIndexPlusOne].dup; - } - - // if we go over text length (in \r\n) we already stored the line, so just exit there - while (i < text.length) - { - // check if next char is going to be end of text, store until then & break - if (i + 1 == text.length) - { - storeLine(text.length); - break; - } - if (text[i] == '\n') - { - storeLine(i); - startIndex = i + 1; - } - else if (i + 1 < text.length && text[i .. i+2] == "\r\n") - { - storeLine(i); - i++; - startIndex = i + 1; - } - i++; - } - lines = linesApp.data; - } - - void process(T)(ref T outbuffer) - { - run(); - outbuffer.reserve(lines.length * 90); - bool prevWritten, empties; - foreach (ref line; lines[firstLineInBlock .. 
lines.length]) - { - if (line.length != 0) - { - // close preceeding line - if (prevWritten) - outbuffer ~= "\n"; - // insert new empty line - if (prevWritten && empties) - outbuffer ~= "\n"; - - outbuffer ~= line; - prevWritten = true; - empties = false; - } - else empties = true; - } - } -} - -unittest -{ - import std.conv : to; - - alias SC = MultiLineCommentHelper!(immutable(char)); - - // checks full comment processing on the given string and compares the generated lines - void check(string comment, string[] lines, size_t lineNo = __LINE__) - { - auto sc = SC(comment); - sc.run(); - assert(sc.lines == lines, sc.lines.to!string ~ " != " ~ lines.to!string - ~ " (for check on line " ~ lineNo.to!string ~ ")"); - } - - // check common cases while typing - check("/++", [""]); - check("/++\r", [""]); - check("/++\n", [""]); - check("/++\r\n", [""]); - check("/++\r\n+", ["", "+"]); - check("/++\r\n+ ok", ["", "ok"]); - check("/++\r\n+ ok\r\n+/", ["", "ok", ""]); - check("/++/", [""]); -} - /** * Helper function used to avoid too much allocations while lexing. * diff --git a/src/dparse/trivia.d b/src/dparse/trivia.d new file mode 100644 index 00000000..2771fe9a --- /dev/null +++ b/src/dparse/trivia.d @@ -0,0 +1,509 @@ +/** + * Module to work with trivia tokens (`comment`, `whitespace`, + * `specialTokenSequence`) which are attached to tokens near them when source + * code gets tokenized. + */ +module dparse.trivia; + +import std.algorithm; +import std.array; +import std.range; +import std.string; +import std.traits; + +import dparse.lexer; + +enum CommentType : ubyte +{ + none, + docLine, + docBlock, + normalLine, + normalBlock, +} + +CommentType determineCommentType(string comment) pure nothrow @safe +{ + auto bytes = comment.representation; + auto index = bytes.startsWith( + "//".representation, + "/+".representation, + "/*".representation + ); + bool isDoc = bytes.length >= 3 && bytes[1] == bytes[2]; + switch (index) + { + case 1: + return isDoc ? 
CommentType.docLine : CommentType.normalLine; + case 2: + case 3: + return isDoc ? CommentType.docBlock : CommentType.normalBlock; + default: + return CommentType.none; + } +} + +/// +unittest +{ + assert (determineCommentType("/// hello") == CommentType.docLine); + assert (determineCommentType("/++ hello") == CommentType.docBlock); + assert (determineCommentType("/** hello") == CommentType.docBlock); + assert (determineCommentType("// hello") == CommentType.normalLine); + assert (determineCommentType("/+ hello") == CommentType.normalBlock); + assert (determineCommentType("/* hello") == CommentType.normalBlock); + assert (determineCommentType("/ hello") == CommentType.none); + assert (determineCommentType("/") == CommentType.none); +} + +bool isDocComment(CommentType type) @safe nothrow pure +{ + return type == CommentType.docLine || type == CommentType.docBlock; +} + +/** + * Removes "decoration" such as leading whitespace, leading + and * characters, + * and places the result into the given output range + */ +public void unDecorateComment(T)(string comment, auto ref T outputRange) +if (isOutputRange!(T, string)) +in +{ + assert (comment.length >= 3); +} +do +{ + import std.string : chompPrefix, KeepTerminator, lineSplitter, stripRight; + + string leadingChars; + + enum LineType { none, normal, strange } + LineType prevLineType; + + switch (comment[0 .. 3]) + { + case "///": + foreach (line; lineSplitter!(KeepTerminator.yes)(comment)) + { + if (leadingChars.empty) + { + size_t k = 3; + while (k < line.length && (line[k] == ' ' || line[k] == '\t')) + k++; + leadingChars = line[0 .. 
k]; + } + outputRange.put(line.chompPrefix(leadingChars)); + } + break; + case "/++": + case "/**": + alias CL = MultiLineCommentHelper!(ElementEncodingType!(typeof(comment))); + CL cl = CL(comment); + cl.process(outputRange); + break; + default: + outputRange.put(comment); + } +} + +/// +unittest +{ + import std.array:array, appender; + import std.stdio:stderr; + stderr.writeln("Running unittest for unDecorateComment..."); + + string[] inputs = [ + "/***************\n*******************/", + "/***************\n *\n ******************/", + "/**\n*/", + "/** */", + "/***/", + "/******/", + "/** abcde1 */", + "/// abcde2\n/// abcde2", + "/**\n * stuff1\n */", + "/**\n *\n * stuff2\n */", + "/**\n *\n * stuff3\n *\n */", + "/**\n *\n * stuff4\n *\n*/", + "/**\n * abcde3\n * abcde3 \n */", + "/**\n * abcde4\n *\n * abcde4\n */", + "/**abcde5\n*abcde5\n*/", + "/** abcde6\n * abcde6\n*/", + "/**\n1\n\n\n\n*/", + "/**\r\n1\r\n\r\n\r\n\r\n*/", + "/**\na1\n\na2\n\n*/", + "/**b1\n*b2\n*b3*/", + "/**c1\n *c2\n *c3*/", + "/**d1\n *d2\n *d3\n*/", + "///a\fbc\n///def" + ]; + string[] outputs = [ + "", + "", + "", + "", + "", + "", + "abcde1", + "abcde2\nabcde2", + "stuff1", + "stuff2", + "stuff3", + "stuff4", + "abcde3\n abcde3", + "abcde4\n\nabcde4", + "abcde5\nabcde5", + "abcde6\nabcde6", + "1", + "1", + "a1\n\na2", + "b1\nb2\nb3", + "c1\nc2\nc3", + "d1\nd2\nd3", + "a\fbc\ndef" + ]; + + // tests where * and + are not interchangeable + string[2][] np = + [ + ["/**\n * d1\n d2\n */", "* d1\nd2"], + ["/**\n + d1\n d2\n */", "+ d1\nd2"], + ["/**d1\n\n\n*d2\n*/", "d1\n\n*d2"], + ]; + + assert(inputs.length == outputs.length); + foreach (pair; zip(inputs, outputs)) + { + foreach (b; [true, false]) + { + auto app = appender!string(); + unDecorateComment(b ? 
pair[0] : pair[0].replace("*", "+"), app); + assert(pair[1] == app.data, "[[" ~ pair[0] ~ "]] => [[" ~ app.data ~ "]]"); + } + } + foreach (pair; np) + { + auto app = appender!string(); + unDecorateComment(pair[0], app); + assert(pair[1] == app.data, "[[" ~ pair[0] ~ "]] => [[" ~ app.data ~ "]]"); + } + stderr.writeln("Unittest for unDecorateComment passed."); +} + +/** Gives a line per line view on DDOC comments of type `/++` and `/**` which + * makes easier to remove the decoration and in an almost 100% nogc way. */ +private struct MultiLineCommentHelper(CharType : const(char)) +{ + // this struct is more used as a 'function with nested functions' would. + this() @disable; + this(this) @disable; + auto opAssign(T)(T t) @disable; + +private: + + char[][] lines; + // either lines.length or lines.length-1, depending on if last line only closes + size_t lastLineInBlockPlusOne; + // either '*' or '+' + const(char) commentChar; + // either 0 or 1, depending on if first line only opens + ubyte firstLineInBlock; + + import std.ascii : isWhite; + + void stripIndent() @safe @nogc pure nothrow + { + if (lines.length < 2) + return; + size_t count; + foreach (const j; 0 .. lines[1].length) + if (!(lines[1][j]).isWhite) + { + count = j; + break; + } + if (count < 2) + return; + foreach (ref line; lines[1 .. $]) + { + foreach (const j; 0 .. line.length) + { + if (!(line[j]).isWhite) + break; + if (j == count - 1) + { + line = line[j .. 
$]; + break; + } + } + } + } + + void processFirstLine() @safe @nogc pure nothrow + { + assert(lines.length); + if (lines[0].length > 3) + { + foreach (const i; 1..lines[0].length) + { + if (lines[0][i] == commentChar) + { + if (i < lines[0].length - 2) + continue; + if (i == lines[0].length - 2 && lines[0][i+1] == '/') + { + lines[0][] = ' '; + break; + } + if (i == lines[0].length - 1) + { + lines[0][] = ' '; + break; + } + } + else + { + lines[0][0..i] = ' '; + break; + } + } + } + lines[0][0..3] = " "; + if (lines.length == 1 && + lines[0][$-2] == commentChar && lines[0][$-1] == '/') + { + lines[0][$-2..$] = " "; + } + foreach (const i; 0..lines[0].length) + if (!(lines[0][i].isWhite)) + return; + firstLineInBlock = 1; + } + + void processLastLine() @safe @nogc pure nothrow + { + lastLineInBlockPlusOne = lines.length; + if (lines.length == 1) + return; + size_t closeStartIndex = size_t.max; + foreach (const i; 0..lines[$-1].length) + { + if (lines[$-1][i] == commentChar) + { + if (closeStartIndex == size_t.max) + closeStartIndex = i; + if (i == lines[$-1].length - 2) + { + // see the FIXME note in unDecorate() + lastLineInBlockPlusOne = closeStartIndex == 0 ? lines.length-1 : lines.length; + + lines[$-1][closeStartIndex..$] = ' '; + break; + } + } + else + { + closeStartIndex = size_t.max; + lastLineInBlockPlusOne = lines.length; + } + } + } + + void unDecorate() @safe @nogc pure nothrow + { + if (lines.length == 1 || lines.length == 2 && lines[$-1].length == 0) + return; + bool allDecorated; + static immutable char[2][2] pattern = [[' ', '*'],[' ', '+']]; + const ubyte patternIndex = commentChar == '+'; + // first line is never decorated + const size_t lo = 1; + // although very uncommon, the last line can be decorated e.g in `* lastline */`: + // the first '*' is a deco if all prev lines are also decorated. + // FIXME: `hi` should be set to `lastLineInBlockPlusOne`... 
+ const size_t hi = (lines[$-1].length > 1 && + (lines[$-1][0] == commentChar || lines[$-1][0..2] == pattern[patternIndex])) + ? lines.length : lines.length-1; + // deco with a leading white + foreach (const i; lo .. hi) + { + if (lines[i].length < 2) + break; + else if (lines[i][0..2] != pattern[patternIndex]) + break; + else if (i == hi-1) + allDecorated = true; + } + // deco w/o leading white + if (!allDecorated) + foreach (const i; lo .. hi) + { + if (lines[i].length == 0) + break; + if (lines[i][0] != commentChar) + break; + else if (i == hi-1) + allDecorated = true; + } + if (!allDecorated) + return; + + const size_t indexToChange = (lines[lo][0] == commentChar) ? 0 : 1; + foreach (ref line; lines[lo .. hi]) + line[indexToChange] = ' '; + } + + void stripLeft() @safe @nogc pure nothrow + { + foreach (const i; 0 .. lines[0].length) + if (!(lines[0][i]).isWhite) + { + lines[0] = lines[0][i..$]; + break; + } + if (lines.length == 1) + return; + while (true) + { + bool processColumn; + foreach (ref line; lines[1 .. lastLineInBlockPlusOne]) + { + if (line.length == 0) + continue; + if (!(line[0]).isWhite) + return; + processColumn = true; + } + if (!processColumn) + return; + foreach (ref line; lines[1 .. lastLineInBlockPlusOne]) + { + if (line.length == 0) + continue; + line = line[1..$]; + } + } + } + + void stripRight() @safe @nogc pure nothrow + { + foreach (ref line; lines[0 .. 
lines.length]) + { + if (line.length == 0) + continue; + if ((line[$-1]).isWhite) + { + size_t firstWhite = line.length; + while (firstWhite > 0 && (line[firstWhite-1]).isWhite) + firstWhite--; + line = line[0..firstWhite]; + } + } + } + + void run() @safe @nogc pure nothrow + { + stripIndent(); + processFirstLine(); + processLastLine(); + unDecorate(); + stripLeft(); + stripRight(); + } + +public: + + this(CharType[] text) @safe pure nothrow + { + assert(text.length >= 3 && text[0] == '/', + "MultiLineCommentHelper text must start with a comment in form /++ or /**"); + + commentChar = text[1]; + size_t startIndex, i; + Appender!(char[][]) linesApp; + linesApp.reserve(512); + + void storeLine(size_t endIndexPlusOne) + { + static if (isMutable!CharType) + linesApp ~= text[startIndex..endIndexPlusOne]; + else + linesApp ~= text[startIndex..endIndexPlusOne].dup; + } + + // if we go over text length (in \r\n) we already stored the line, so just exit there + while (i < text.length) + { + // check if next char is going to be end of text, store until then & break + if (i + 1 == text.length) + { + storeLine(text.length); + break; + } + if (text[i] == '\n') + { + storeLine(i); + startIndex = i + 1; + } + else if (i + 1 < text.length && text[i .. i+2] == "\r\n") + { + storeLine(i); + i++; + startIndex = i + 1; + } + i++; + } + lines = linesApp.data; + } + + void process(T)(ref T outbuffer) + { + run(); + outbuffer.reserve(lines.length * 90); + bool prevWritten, empties; + foreach (ref line; lines[firstLineInBlock .. 
lines.length]) + { + if (line.length != 0) + { + // close preceding line + if (prevWritten) + outbuffer ~= "\n"; + // insert new empty line + if (prevWritten && empties) + outbuffer ~= "\n"; + + outbuffer ~= line; + prevWritten = true; + empties = false; + } + else empties = true; + } + } +} + +unittest +{ + import std.conv : to; + + alias SC = MultiLineCommentHelper!(immutable(char)); + + // checks full comment processing on the given string and compares the generated lines + void check(string comment, string[] lines, size_t lineNo = __LINE__) + { + auto sc = SC(comment); + sc.run(); + assert(sc.lines == lines, sc.lines.to!string ~ " != " ~ lines.to!string + ~ " (for check on line " ~ lineNo.to!string ~ ")"); + } + + // check common cases while typing + check("/++", [""]); + check("/++\r", [""]); + check("/++\n", [""]); + check("/++\r\n", [""]); + check("/++\r\n+", ["", "+"]); + check("/++\r\n+ ok", ["", "ok"]); + check("/++\r\n+ ok\r\n+/", ["", "ok", ""]); + check("/++/", [""]); +} From 4d936690ef253d9b13344ed52480546b4ade5445 Mon Sep 17 00:00:00 2001 From: Jan Jurzitza Date: Wed, 8 Apr 2020 15:48:30 +0200 Subject: [PATCH 3/5] make token attributes not apply to extraFields For dparse the only practical effect is that @nogc is no longer forced onto the members mixed in via extraFields. This gives TokenStructure instantiations more control over their extra members, for example to define GC-dependent functions. 
--- src/std/experimental/lexer.d | 123 ++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 60 deletions(-) diff --git a/src/std/experimental/lexer.d b/src/std/experimental/lexer.d index a4568669..18700b6e 100644 --- a/src/std/experimental/lexer.d +++ b/src/std/experimental/lexer.d @@ -255,75 +255,78 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens, */ struct TokenStructure(IdType, string extraFields = "") { -public pure nothrow @safe @nogc: - - bool opEquals(ref const typeof(this) other) const + public pure nothrow @safe @nogc { - return this.type == other.type && this.text == other.text; - } - /** - * Returns: true if the token has the given type, false otherwise. - */ - bool opEquals(IdType type) const - { - return this.type == type; - } + bool opEquals(ref const typeof(this) other) const + { + return this.type == other.type && this.text == other.text; + } - /** - * Constructs a token from a token type. - * Params: type = the token type - */ - this(IdType type) - { - this.type = type; - } + /** + * Returns: true if the token has the given type, false otherwise. + */ + bool opEquals(IdType type) const + { + return this.type == type; + } - /** - * Constructs a token. - * Params: - * type = the token type - * text = the text of the token, which may be null - * line = the line number at which this token occurs - * column = the column number at which this token occurs - * index = the byte offset from the beginning of the input at which this - * token occurs - */ - this(IdType type, string text, size_t line, size_t column, size_t index) - { - this.text = text; - this.line = line; - this.column = column; - this.type = type; - this.index = index; - } + /** + * Constructs a token from a token type. + * Params: type = the token type + */ + this(IdType type) + { + this.type = type; + } - /** - * The _text of the token. - */ - string text; + /** + * Constructs a token. 
+ * Params: + * type = the token type + * text = the text of the token, which may be null + * line = the line number at which this token occurs + * column = the column number at which this token occurs + * index = the byte offset from the beginning of the input at which this + * token occurs + */ + this(IdType type, string text, size_t line, size_t column, size_t index) + { + this.text = text; + this.line = line; + this.column = column; + this.type = type; + this.index = index; + } - /** - * The _line number at which this token occurs. - */ - size_t line; + /** + * The _text of the token. + */ + string text; - /** - * The _column number at which this token occurs. This is measured in bytes - * and may not be correct when tab characters are involved. - */ - size_t column; + /** + * The _line number at which this token occurs. + */ + size_t line; - /** - * The byte offset from the beginning of the input at which this token - * occurs. - */ - size_t index; + /** + * The _column number at which this token occurs. This is measured in bytes + * and may not be correct when tab characters are involved. + */ + size_t column; - /** - * The token type. - */ - IdType type; + /** + * The byte offset from the beginning of the input at which this token + * occurs. + */ + size_t index; + + /** + * The token type. + */ + IdType type; + + } mixin (extraFields); } From db2a6699287a944f9571fab9f470eec6f075d19b Mon Sep 17 00:00:00 2001 From: Jan Jurzitza Date: Wed, 8 Apr 2020 16:02:51 +0200 Subject: [PATCH 4/5] attach whitespace & comment tokens to tokens This replaces the previous comment and trailingComment properties. Maintains full feature backwards-compatibility, only some `@nogc` code may now fail to compile if it used the comment or trailingComment properties from Token. 
supersedes #149 --- src/dparse/lexer.d | 158 +++++++++++++++++++------------------ src/dparse/trivia.d | 187 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+), 75 deletions(-) diff --git a/src/dparse/lexer.d b/src/dparse/lexer.d index 21ee97d3..26615431 100644 --- a/src/dparse/lexer.d +++ b/src/dparse/lexer.d @@ -119,20 +119,64 @@ public template tok(string token) alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token); } -private enum extraFields = q{ - string comment; - string trailingComment; +mixin template TokenTriviaFields() +{ + /** + * Whitespace and comment tokens attached to this token. + * + * All trivia tokens must have the text property set to the exact source + * text they were lexed from. This means you can map all trivia tokens to + * their .text property and join them together to get the source code back + * without any loss of information. + * + * Trivia is only included when calling getTokensForParser. When iterating + * over DLexer all tokens will be in their raw form and none will be + * converted to trivia. + * + * Note: in the future you might need to explicitly pass + * WhitespaceBehavior.include (or keep the default) as getTokensForParser + * currently overrides it to include. + * + * Contains: `comment`, `whitespace`, `specialTokenSequence` + */ + const(typeof(this))[] leadingTrivia; + /// ditto + const(typeof(this))[] trailingTrivia; + + string memoizedLeadingComment = null; + string memoizedTrailingComment = null; + + /// Legacy property to get documentation comments, with comment border + /// stripped off, which is attached to this token. 
+ string comment() const pure nothrow @safe @property { + import dparse.trivia : extractLeadingDdoc; + if (memoizedLeadingComment !is null) + return memoizedLeadingComment; + return (cast()memoizedLeadingComment) = this.extractLeadingDdoc; + } + + /// ditto + string trailingComment() const pure nothrow @safe @property { + import dparse.trivia : extractTrailingDdoc; + if (memoizedTrailingComment !is null) + return memoizedTrailingComment; + return (cast()memoizedTrailingComment) = this.extractTrailingDdoc; + } - int opCmp(size_t i) const pure nothrow @safe { + int opCmp(size_t i) const pure nothrow @safe @nogc { if (index < i) return -1; if (index > i) return 1; return 0; } - int opCmp(ref const typeof(this) other) const pure nothrow @safe { + int opCmp(ref const typeof(this) other) const pure nothrow @safe @nogc { return opCmp(other.index); } -}; +} + +// The fields are mixed in from dparse.lexer to keep error messages a manageable +// size, since the entire string is dumped when there is a type mismatch. +private enum extraFields = "import dparse.lexer:TokenTriviaFields; mixin TokenTriviaFields;"; /// The token type in the D lexer public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields); @@ -410,97 +454,61 @@ public bool isLiteral(IdType type) pure nothrow @safe @nogc } /** - * Returns: an array of tokens lexed from the given source code to the output range. All - * whitespace tokens are skipped and comments are attached to the token nearest - * to them. + * Returns: an array of tokens lexed from the given source code to the output + * range. All whitespace, comment and specialTokenSequence tokens (trivia) are + * attached to the token nearest to them. + * + * Trivia is put on the last token as `trailingTrivia` if it is on the same + * line as the trivia, otherwise it will be attached to the next token in the + * `leadingTrivia` until there is the EOF, where it will be attached as + * `trailingTrivia` again. 
*/ const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache) if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) { - enum CommentType : ubyte - { - notDoc, - line, - block - } - - static CommentType commentType(string comment) pure nothrow @safe - { - if (comment.length < 3) - return CommentType.notDoc; - if (comment[0 ..3] == "///") - return CommentType.line; - if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**") - return CommentType.block; - return CommentType.notDoc; - } - - config.whitespaceBehavior = WhitespaceBehavior.skip; + config.whitespaceBehavior = WhitespaceBehavior.include; config.commentBehavior = CommentBehavior.noIntern; - auto leadingCommentAppender = appender!(char[])(); - leadingCommentAppender.reserve(1024); - auto trailingCommentAppender = appender!(char[])(); - trailingCommentAppender.reserve(1024); - bool hadDdoc; - string empty = cache.intern(""); + auto leadingTriviaAppender = appender!(Token[])(); + leadingTriviaAppender.reserve(128); + auto trailingTriviaAppender = appender!(Token[])(); + trailingTriviaAppender.reserve(128); + auto output = appender!(typeof(return))(); auto lexer = DLexer(sourceCode, config, cache); - size_t tokenCount; loop: while (!lexer.empty) switch (lexer.front.type) { case tok!"specialTokenSequence": case tok!"whitespace": - lexer.popFront(); - break; case tok!"comment": - final switch (commentType(lexer.front.text)) - { - case CommentType.block: - case CommentType.line: - if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line) - { - if (!trailingCommentAppender.data.empty) - trailingCommentAppender.put('\n'); - unDecorateComment(lexer.front.text, trailingCommentAppender); - hadDdoc = true; - } - else - { - if (!leadingCommentAppender.data.empty) - leadingCommentAppender.put('\n'); - unDecorateComment(lexer.front.text, leadingCommentAppender); - hadDdoc = true; - } - lexer.popFront(); - break; - case CommentType.notDoc: - lexer.popFront(); - 
break; - } + if (!output.data.empty && lexer.front.line == output.data[$ - 1].line) + trailingTriviaAppender.put(lexer.front); + else + leadingTriviaAppender.put(lexer.front); + lexer.popFront(); break; case tok!"__EOF__": - if (!trailingCommentAppender.data.empty) - (cast() output.data[$ - 1].trailingComment) = cache.intern(cast(string) trailingCommentAppender.data); break loop; default: Token t = lexer.front; lexer.popFront(); - tokenCount++; - if (!output.data.empty && !trailingCommentAppender.data.empty) - { - (cast() output.data[$ - 1].trailingComment) = - cache.intern(cast(string) trailingCommentAppender.data); - hadDdoc = false; - } - t.comment = leadingCommentAppender.data.length > 0 - ? cache.intern(cast(string) leadingCommentAppender.data) : (hadDdoc ? empty : null); - leadingCommentAppender.clear(); - trailingCommentAppender.clear(); - hadDdoc = false; + + if (!output.data.empty && !trailingTriviaAppender.data.empty) + (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.dup; + t.leadingTrivia = leadingTriviaAppender.data.dup; + leadingTriviaAppender.clear(); + trailingTriviaAppender.clear(); + output.put(t); break; } + + if (!output.data.empty) + { + trailingTriviaAppender.put(leadingTriviaAppender.data); + (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.dup; + } + return output.data; } diff --git a/src/dparse/trivia.d b/src/dparse/trivia.d index 2771fe9a..0194b905 100644 --- a/src/dparse/trivia.d +++ b/src/dparse/trivia.d @@ -507,3 +507,190 @@ unittest check("/++\r\n+ ok\r\n+/", ["", "ok", ""]); check("/++/", [""]); } + +/// Extracts and combines ddoc comments from trivia comments. 
+string extractDdocFromTrivia(Tokens)(Tokens tokens) pure nothrow @safe + if (isInputRange!Tokens && is(ElementType!Tokens : Token)) +{ + auto ret = appender!string; + foreach (trivia; tokens) + { + if (trivia.type == tok!"comment" + && trivia.text.determineCommentType.isDocComment) + { + if (!ret.data.empty) + ret.put('\n'); + unDecorateComment(trivia.text, ret); + } + } + return ret.data; +} + +string extractLeadingDdoc(const Token token) pure nothrow @safe +{ + return extractDdocFromTrivia(token.leadingTrivia); +} + +string extractTrailingDdoc(const Token token) pure nothrow @safe +{ + return extractDdocFromTrivia(token.trailingTrivia.filter!(a => a.line == token.line)); +} + +// test token trivia members +unittest +{ + import std.conv : to; + import std.exception : enforce; + + static immutable src = `/// this is a module. +// mixed +/// it can do stuff +module foo.bar; + +// hello + +/** + * some doc + * hello + */ +int x; /// very nice + +// TODO: do stuff +void main() { + #line 40 + /// could be better + writeln(":)"); +} + +/// end of file`; + + LexerConfig cf; + StringCache ca = StringCache(16); + + const tokens = getTokensForParser(src, cf, &ca); + + assert(tokens.length == 19); + + assert(tokens[0].type == tok!"module"); + assert(tokens[0].leadingTrivia.length == 6); + assert(tokens[0].leadingTrivia[0].type == tok!"comment"); + assert(tokens[0].leadingTrivia[0].text == "/// this is a module."); + assert(tokens[0].leadingTrivia[1].type == tok!"whitespace"); + assert(tokens[0].leadingTrivia[1].text == "\n"); + assert(tokens[0].leadingTrivia[2].type == tok!"comment"); + assert(tokens[0].leadingTrivia[2].text == "// mixed"); + assert(tokens[0].leadingTrivia[3].type == tok!"whitespace"); + assert(tokens[0].leadingTrivia[3].text == "\n"); + assert(tokens[0].leadingTrivia[4].type == tok!"comment"); + assert(tokens[0].leadingTrivia[4].text == "/// it can do stuff"); + assert(tokens[0].leadingTrivia[5].type == tok!"whitespace"); + 
assert(tokens[0].leadingTrivia[5].text == "\n"); + assert(tokens[0].trailingTrivia.length == 1); + assert(tokens[0].trailingTrivia[0].type == tok!"whitespace"); + assert(tokens[0].trailingTrivia[0].text == " "); + + assert(tokens[1].type == tok!"identifier"); + assert(tokens[1].text == "foo"); + assert(!tokens[1].leadingTrivia.length); + assert(!tokens[1].trailingTrivia.length); + + assert(tokens[2].type == tok!"."); + assert(!tokens[2].leadingTrivia.length); + assert(!tokens[2].trailingTrivia.length); + + assert(tokens[3].type == tok!"identifier"); + assert(tokens[3].text == "bar"); + assert(!tokens[3].leadingTrivia.length); + assert(!tokens[3].trailingTrivia.length); + + assert(tokens[4].type == tok!";"); + assert(!tokens[4].leadingTrivia.length); + assert(tokens[4].trailingTrivia.length == 1); + assert(tokens[4].trailingTrivia[0].type == tok!"whitespace"); + assert(tokens[4].trailingTrivia[0].text == "\n\n"); + + assert(tokens[5].type == tok!"int"); + assert(tokens[5].leadingTrivia.length == 4); + assert(tokens[5].leadingTrivia[0].text == "// hello"); + assert(tokens[5].leadingTrivia[1].text == "\n\n"); + assert(tokens[5].leadingTrivia[2].text == "/**\n * some doc\n * hello\n */"); + assert(tokens[5].leadingTrivia[3].text == "\n"); + assert(tokens[5].trailingTrivia.length == 1); + assert(tokens[5].trailingTrivia[0].text == " "); + + assert(tokens[6].type == tok!"identifier"); + assert(tokens[6].text == "x"); + assert(!tokens[6].leadingTrivia.length); + assert(!tokens[6].trailingTrivia.length); + + assert(tokens[7].type == tok!";"); + assert(!tokens[7].leadingTrivia.length); + assert(tokens[7].trailingTrivia.length == 3); + assert(tokens[7].trailingTrivia[0].text == " "); + assert(tokens[7].trailingTrivia[1].text == "/// very nice"); + assert(tokens[7].trailingTrivia[2].text == "\n\n"); + + assert(tokens[8].type == tok!"void"); + assert(tokens[8].leadingTrivia.length == 2); + assert(tokens[8].leadingTrivia[0].text == "// TODO: do stuff"); + 
assert(tokens[8].leadingTrivia[1].text == "\n"); + assert(tokens[8].trailingTrivia.length == 1); + assert(tokens[8].trailingTrivia[0].text == " "); + + assert(tokens[9].type == tok!"identifier"); + assert(tokens[9].text == "main"); + assert(!tokens[9].leadingTrivia.length); + assert(!tokens[9].trailingTrivia.length); + + assert(tokens[10].type == tok!"("); + assert(!tokens[10].leadingTrivia.length); + assert(!tokens[10].trailingTrivia.length); + + assert(tokens[11].type == tok!")"); + assert(!tokens[11].leadingTrivia.length); + assert(tokens[11].trailingTrivia.length == 1); + assert(tokens[11].trailingTrivia[0].text == " "); + + assert(tokens[12].type == tok!"{"); + assert(!tokens[12].leadingTrivia.length); + assert(tokens[12].trailingTrivia.length == 1); + assert(tokens[12].trailingTrivia[0].text == "\n "); + + assert(tokens[13].type == tok!"identifier"); + assert(tokens[13].text == "writeln"); + assert(tokens[13].leadingTrivia.length == 4); + assert(tokens[13].leadingTrivia[0].type == tok!"specialTokenSequence"); + assert(tokens[13].leadingTrivia[0].text == "#line 40"); + assert(tokens[13].leadingTrivia[1].type == tok!"whitespace"); + assert(tokens[13].leadingTrivia[1].text == "\n "); + assert(tokens[13].leadingTrivia[2].type == tok!"comment"); + assert(tokens[13].leadingTrivia[2].text == "/// could be better"); + assert(tokens[13].leadingTrivia[3].type == tok!"whitespace"); + assert(tokens[13].leadingTrivia[3].text == "\n "); + assert(!tokens[13].trailingTrivia.length); + + assert(tokens[14].type == tok!"("); + assert(!tokens[14].leadingTrivia.length); + assert(!tokens[14].trailingTrivia.length); + + assert(tokens[15].type == tok!"stringLiteral"); + assert(!tokens[15].leadingTrivia.length); + assert(!tokens[15].trailingTrivia.length); + + assert(tokens[16].type == tok!")"); + assert(!tokens[16].leadingTrivia.length); + assert(!tokens[16].trailingTrivia.length); + + assert(tokens[17].type == tok!";"); + assert(!tokens[17].leadingTrivia.length); + 
assert(tokens[17].trailingTrivia.length == 1); + assert(tokens[17].trailingTrivia[0].text == "\n"); + + assert(tokens[18].type == tok!"}"); + assert(!tokens[18].leadingTrivia.length); + assert(tokens[18].trailingTrivia.length == 2); + assert(tokens[18].trailingTrivia[0].type == tok!"whitespace"); + assert(tokens[18].trailingTrivia[0].text == "\n\n"); + assert(tokens[18].trailingTrivia[1].type == tok!"comment"); + assert(tokens[18].trailingTrivia[1].text == "/// end of file"); +} From 14d5e3c14eb23f2f1538419b1cd4d9d6f0b13c1b Mon Sep 17 00:00:00 2001 From: Jan Jurzitza Date: Thu, 9 Apr 2020 07:27:56 +0200 Subject: [PATCH 5/5] forgot this file --- meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/meson.build b/meson.build index 0fba5df6..c3568bd7 100644 --- a/meson.build +++ b/meson.build @@ -20,6 +20,7 @@ dparse_src = [ 'src/dparse/parser.d', 'src/dparse/rollback_allocator.d', 'src/dparse/stack_buffer.d', + 'src/dparse/trivia.d', 'src/std/experimental/lexer.d', ]