From 37325a94448e73cd9aafc5ebf4c9f12ccd9db409 Mon Sep 17 00:00:00 2001 From: Charles Strahan Date: Wed, 21 Jul 2021 19:34:29 -0500 Subject: [PATCH] fix string scanning Fix handling of '$' char in strings. Fixes #15 --- corpus/basic.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ src/scanner.c | 21 ++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/corpus/basic.txt b/corpus/basic.txt index e27117c38e..dd794b3509 100644 --- a/corpus/basic.txt +++ b/corpus/basic.txt @@ -102,6 +102,56 @@ This works, too: ''$ (interpolation (identifier)) (escape_sequence))) +==================== +string ($) +==================== + +[ + "$" + "$\n" + "${x}" + "$${x}" + "$$${x}" +] + +--- + +(source_expression + (list + (string) + (string + (escape_sequence)) + (string + (interpolation (identifier))) + (string) + (string + (interpolation (identifier))))) + +==================== +indented string ($) +==================== + +[ + ''$'' + ''$''\n'' + ''${x}'' + ''$${x}'' + ''$$${x}'' +] + +--- + +(source_expression + (list + (indented_string) + (indented_string + (escape_sequence)) + (indented_string + (interpolation (identifier))) + (indented_string) + (indented_string + (interpolation (identifier))))) + ==================== uri ==================== diff --git a/src/scanner.c b/src/scanner.c index b33c1087df..37d842002a 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -18,7 +18,14 @@ static void skip(TSLexer *lexer) { } static bool scan_str(TSLexer *lexer) { + // We want to delegate the scanning of the start-of-string/end-of-string '"' + // character to the grammar defined in grammar.js. + // So the idea is we track if we've seen any string content, + // and if we see an unescaped '"' char _and_ we haven't consumed any string content, + // we return false to indicate to tree-sitter that our custom scanner has not found + // a token. bool has_content = false; + lexer->result_symbol = STR_CONTENT; while (true) { @@ -49,6 +56,11 @@ static bool scan_str(TSLexer *lexer) { } else { return false; } + } else if (lexer->lookahead != '"' && lexer->lookahead != '\\' ) { + // any char following '$' other than '"', '\\' and '{' (which was handled above) + // should be consumed as additional string content. + advance(lexer); + lexer->mark_end(lexer); } has_content = true; break; @@ -68,8 +80,11 @@ static bool scan_str(TSLexer *lexer) { } static bool scan_ind_str(TSLexer *lexer) { + // See the comment about has_content in scan_str(). bool has_content = false; + lexer->result_symbol = IND_STR_CONTENT; + while (true) { switch (lexer->lookahead) { case '$': @@ -81,7 +96,13 @@ static bool scan_ind_str(TSLexer *lexer) { } else { return false; } + } else if (lexer->lookahead != '\'') { + // any char following '$' other than '\'' and '{' (which was handled above) + // should be consumed as additional string content. + advance(lexer); + lexer->mark_end(lexer); } + has_content = true; break; case '\'': lexer->mark_end(lexer);