From 37325a94448e73cd9aafc5ebf4c9f12ccd9db409 Mon Sep 17 00:00:00 2001
From: Charles Strahan <charles@cstrahan.com>
Date: Wed, 21 Jul 2021 19:34:29 -0500
Subject: [PATCH] fix string scanning

Fix handling of '$' char in strings.

Fixes #15
---
 corpus/basic.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/scanner.c    | 21 ++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/corpus/basic.txt b/corpus/basic.txt
index e27117c38e..dd794b3509 100644
--- a/corpus/basic.txt
+++ b/corpus/basic.txt
@@ -102,6 +102,56 @@ This works, too:  ''$
         (interpolation (identifier))
         (escape_sequence)))
 
+====================
+string ($)
+====================
+
+[
+    "$"
+    "$\n"
+    "${x}"
+    "$${x}"
+    "$$${x}"
+]
+
+---
+
+(source_expression
+    (list
+        (string)
+        (string
+            (escape_sequence))
+        (string
+            (interpolation (identifier)))
+        (string)
+        (string
+            (interpolation (identifier)))))
+
+====================
+indented string ($)
+====================
+
+[
+    ''$''
+    ''$''\n''
+    ''${x}''
+    ''$${x}''
+    ''$$${x}''
+]
+
+---
+
+(source_expression
+    (list
+        (indented_string)
+        (indented_string
+            (escape_sequence))
+        (indented_string
+            (interpolation (identifier)))
+        (indented_string)
+        (indented_string
+            (interpolation (identifier)))))
+
 ====================
 uri
 ====================
diff --git a/src/scanner.c b/src/scanner.c
index b33c1087df..37d842002a 100644
--- a/src/scanner.c
+++ b/src/scanner.c
@@ -18,7 +18,14 @@ static void skip(TSLexer *lexer) {
 }
 
 static bool scan_str(TSLexer *lexer) {
+  // The opening and closing '"' delimiters are scanned by the grammar defined
+  // in grammar.js, not by this custom scanner.
+  // We therefore track whether any string content has been consumed: if we
+  // see an unescaped '"' char _and_ no string content has been consumed yet,
+  // we return false to indicate to tree-sitter that our custom scanner has not
+  // found a token.
   bool has_content = false;
+  
   lexer->result_symbol = STR_CONTENT;
 
   while (true) {
@@ -49,6 +56,11 @@ static bool scan_str(TSLexer *lexer) {
           } else {
             return false;
           }
+        } else if (lexer->lookahead != '"' && lexer->lookahead != '\\' ) {
+          // Any char following '$' other than '"', '\\', or '{' (the '{' case is
+          // handled above) is consumed as additional string content.
+          advance(lexer);
+          lexer->mark_end(lexer);
         }
         has_content = true;
         break;
@@ -68,8 +80,11 @@ static bool scan_str(TSLexer *lexer) {
 }
 
 static bool scan_ind_str(TSLexer *lexer) {
+  // See the comment about has_content in scan_str().
   bool has_content = false;
+
   lexer->result_symbol = IND_STR_CONTENT;
+  
   while (true) {
     switch (lexer->lookahead) {
       case '$':
@@ -81,7 +96,13 @@ static bool scan_ind_str(TSLexer *lexer) {
           } else {
             return false;
           }
+        } else if (lexer->lookahead != '\'') {
+          // Any char following '$' other than '\'' or '{' (the '{' case is
+          // handled above) is consumed as additional string content.
+          advance(lexer);
+          lexer->mark_end(lexer);
         }
+        has_content = true;
         break;
       case '\'':
         lexer->mark_end(lexer);