Unescape at tokenizer level

ubaidsk · ubaidsk · commit da3b5c038af5 · 2023-06-14T04:53:45.000+05:30
Support raw string tokens

Support bytes and raw bytes

Add support for unicode, fmt, raw_fmt strings
diff --git a/src/lpython/parser/parser.yy b/src/lpython/parser/parser.yy
@@ -103,6 +103,12 @@ void yyerror(YYLTYPE *yyloc, LCompilers::LPython::Parser &p, const std::string &
 %token TK_CARET "^"
 %token TK_AT "@"
 %token <string> TK_STRING
+%token <string> TK_RAW_STRING
+%token <string> TK_UNI_STRING
+%token <string> TK_FMT_STRING
+%token <string> TK_RAW_FMT_STRING
+%token <string> TK_BYTES
+%token <string> TK_RAW_BYTES
 %token <string> TK_COMMENT
 %token <string> TK_EOLCOMMENT
 %token <string> TK_TYPE_COMMENT
@@ -1101,10 +1107,20 @@ subscript
     ;
 
 string
-    : string TK_STRING { $$ = STRING2($1, $2, @$); } // TODO
-    | string id TK_STRING { $$ = STRING4($1, STRING3($2, $3, @$), @$); }
+    : string TK_STRING { $$ = STRING4($1, $2, @$); } // TODO
+    | string TK_RAW_STRING { $$ = STRING5($1, STRING1($2, @$), @$); }
+    | string TK_UNI_STRING { $$ = STRING5($1, STRING2($2, @$), @$); }
+    | string TK_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); }
+    | string TK_RAW_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); }
+    | string TK_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); }
+    | string TK_RAW_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); }
     | TK_STRING { $$ = STRING1($1, @$); }
-    | id TK_STRING { $$ = STRING3($1, $2, @$); }
+    | TK_RAW_STRING { $$ = STRING1($1, @$); }
+    | TK_UNI_STRING { $$ = STRING2($1, @$); }
+    | TK_FMT_STRING { $$ = STRING3($1, @$); }
+    | TK_RAW_FMT_STRING { $$ = STRING3($1, @$); }
+    | TK_BYTES { $$ = BYTES1($1, @$); }
+    | TK_RAW_BYTES { $$ = BYTES1($1, @$); }
     ;
 
 lambda_parameter
diff --git a/src/lpython/parser/semantics.h b/src/lpython/parser/semantics.h
@@ -798,87 +798,65 @@ static inline ast_t* concat_string(Allocator &al, Location &l,
         x.c_str(p.m_a), expr_contextType::Load)
 // `x.int_n` is of type BigInt but we store the int64_t directly in AST
 #define INTEGER(x, l) make_ConstantInt_t(p.m_a, l, x, nullptr)
-#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, str_unescape_c(p.m_a, x), nullptr)
-#define STRING2(x, y, l) concat_string(p.m_a, l, EXPR(x), str_unescape_c(p.m_a, y), nullptr)
-#define STRING3(id, x, l) PREFIX_STRING(p.m_a, l, name2char(id), x.c_str(p.m_a))
-#define STRING4(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s))
+#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), nullptr)
+#define STRING2(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), LCompilers::s2c(p.m_a, "u"))
+#define STRING3(x, l) FMT_STRING(p.m_a, l, x.c_str(p.m_a))
+#define STRING4(x, y, l) concat_string(p.m_a, l, EXPR(x), y.c_str(p.m_a), nullptr)
+#define STRING5(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s))
+#define BYTES1(x, l) make_ConstantBytes_t(p.m_a, l, x.c_str(p.m_a), nullptr)
 #define FLOAT(x, l) make_ConstantFloat_t(p.m_a, l, x, nullptr)
 #define COMPLEX(x, l) make_ConstantComplex_t(p.m_a, l, 0, x, nullptr)
 #define BOOL(x, l) make_ConstantBool_t(p.m_a, l, x, nullptr)
 
-static inline ast_t *PREFIX_STRING(Allocator &al, Location &l, char *prefix, char *s){
+static inline ast_t *FMT_STRING(Allocator &al, Location &l, char *s){
     Vec<expr_t *> exprs;
     exprs.reserve(al, 4);
     ast_t *tmp = nullptr;
-    if (strcmp(prefix, "U") == 0 ) {
-        return make_ConstantStr_t(al, l,  s, nullptr);
-    }
-    for (size_t i = 0; i < strlen(prefix); i++) {
-        prefix[i] = tolower(prefix[i]);
-    }
-    if (strcmp(prefix, "f") == 0 || strcmp(prefix, "fr") == 0
-            || strcmp(prefix, "rf") == 0) {
-        std::string str = std::string(s);
-        std::string s1 = "\"";
-        std::string id;
-        std::vector<std::string> strs;
-        bool open_paren = false;
-        for (size_t i = 0; i < str.length(); i++) {
+
+    std::string str = std::string(s);
+    std::string s1 = "\"";
+    std::string id;
+    std::vector<std::string> strs;
+    bool open_paren = false;
+    for (size_t i = 0; i < str.length(); i++) {
             if(str[i] == '{') {
-                if(s1 != "\"") {
+            if(s1 != "\"") {
                     s1.push_back('"');
                     strs.push_back(s1);
                     s1 = "\"";
-                }
-                open_paren = true;
+            }
+            open_paren = true;
             } else if (str[i] != '}' && open_paren) {
-                id.push_back(s[i]);
+            id.push_back(s[i]);
             } else if (str[i] == '}') {
-                if(id != "") {
+            if(id != "") {
                     strs.push_back(id);
                     id = "";
-                }
-                open_paren = false;
+            }
+            open_paren = false;
             } else if (!open_paren) {
-                s1.push_back(s[i]);
+            s1.push_back(s[i]);
             }
             if(i == str.length()-1 && s1 != "\"") {
-                s1.push_back('"');
-                strs.push_back(s1);
+            s1.push_back('"');
+            strs.push_back(s1);
             }
-        }
+    }
 
-        for (size_t i = 0; i < strs.size(); i++) {
+    for (size_t i = 0; i < strs.size(); i++) {
             if (strs[i][0] == '"') {
-                strs[i] = strs[i].substr(1, strs[i].length() - 2);
-                tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr);
-                exprs.push_back(al, down_cast<expr_t>(tmp));
+            strs[i] = strs[i].substr(1, strs[i].length() - 2);
+            tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr);
+            exprs.push_back(al, down_cast<expr_t>(tmp));
             } else {
-                tmp = make_Name_t(al, l,
-                        LCompilers::s2c(al, strs[i]), expr_contextType::Load);
-                tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr);
-                exprs.push_back(al, down_cast<expr_t>(tmp));
+            tmp = make_Name_t(al, l,
+                    LCompilers::s2c(al, strs[i]), expr_contextType::Load);
+            tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr);
+            exprs.push_back(al, down_cast<expr_t>(tmp));
             }
-        }
-        tmp = make_JoinedStr_t(al, l, exprs.p, exprs.size());
-    } else if (strcmp(prefix, "b") == 0) {
-        LCompilers::Str s_;
-        s_.from_str(al, std::string(s));
-        std::string str = std::string(str_unescape_c(al, s_));
-        str = "b'" + str + "'";
-        tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr);
-    } else if ( strcmp(prefix, "br") == 0 || strcmp(prefix, "rb") == 0) {
-        std::string str = std::string(s);
-        str = "b'" + str + "'";
-        tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr);
-    } else if (strcmp(prefix, "r") == 0 ) {
-        tmp = make_ConstantStr_t(al, l,  s, nullptr);
-    } else if (strcmp(prefix, "u") == 0 ) {
-        tmp = make_ConstantStr_t(al, l,  s, LCompilers::s2c(al, "u"));
-    } else {
-        throw LCompilers::LCompilersException("The string is not recognized by the parser.");
     }
-    return tmp;
+
+    return make_JoinedStr_t(al, l, exprs.p, exprs.size());
 }
 
 static inline keyword_t *CALL_ARG_KW(Allocator &al, Location &l,
diff --git a/src/lpython/parser/tokenizer.h b/src/lpython/parser/tokenizer.h
@@ -3,6 +3,7 @@
 
 #include <libasr/exception.h>
 #include <libasr/alloc.h>
+#include <libasr/string_utils.h>
 #include <lpython/parser/parser_stype.h>
 
 #define MAX_PAREN_LEVEL 200
@@ -54,18 +55,129 @@ class Tokenizer
     }
 
     // Return the current token as YYSTYPE::Str, strips first and last character
-    void token_str(Str &s) const
+    void token_str(Allocator &al, Str &s) const
     {
         s.p = (char*) tok + 1;
         s.n = cur-tok-2;
+        s.p = str_unescape_c(al, s);
+        s.n = strlen(s.p);
     }
 
     // Return the current token as YYSTYPE::Str, strips the first 3 and the last
     // 3 characters
-    void token_str3(Str &s) const
+    void token_str3(Allocator &al, Str &s) const
     {
         s.p = (char*) tok + 3;
         s.n = cur-tok-6;
+        s.p = str_unescape_c(al, s);
+        s.n = strlen(s.p);
+    }
+
+    // Return the current token as YYSTYPE::Str with the first 2 characters and the last character stripped; the text is not unescaped
+    void token_raw_str(Str &s) const
+    {
+        s.p = (char*) tok + 2; // points into the token itself; presumably skips the `r` prefix and the opening quote
+        s.n = cur-tok-3; // token length minus the 2 leading and 1 trailing stripped characters
+    }
+
+    // Return the current token as YYSTYPE::Str with the first 4 and the last
+    // 3 characters stripped; the remaining text is not unescaped
+    void token_raw_str3(Str &s) const
+    {
+        s.p = (char*) tok + 4; // points into the token itself; presumably skips the `r` prefix and a triple opening quote
+        s.n = cur-tok-7; // token length minus the 4 leading and 3 trailing stripped characters
+    }
+
+    // Return the current token as YYSTYPE::Str: strips the first 2 and the last character, then unescapes the rest with str_unescape_c
+    void token_unicode_str(Allocator &al, Str &s) const
+    {
+        s.p = (char*) tok + 2; // presumably skips the `u` prefix and the opening quote
+        s.n = cur-tok-3; // token length minus the 2 leading and 1 trailing stripped characters
+        s.p = str_unescape_c(al, s); // s.p now refers to the unescaped text returned by str_unescape_c
+        s.n = strlen(s.p); // NOTE(review): strlen stops at the first NUL byte; confirm an escape yielding NUL cannot truncate the text
+    }
+
+    // Return the current token as YYSTYPE::Str: strips the first 4 and the last 3 chars, then unescapes the rest with str_unescape_c
+    void token_unicode_str3(Allocator &al, Str &s) const
+    {
+        s.p = (char*) tok + 4; // presumably skips the `u` prefix and a triple opening quote
+        s.n = cur-tok-7; // token length minus the 4 leading and 3 trailing stripped characters
+        s.p = str_unescape_c(al, s); // s.p now refers to the unescaped text returned by str_unescape_c
+        s.n = strlen(s.p); // NOTE(review): strlen stops at the first NUL byte; confirm an escape yielding NUL cannot truncate the text
+    }
+
+    // Return the current token as YYSTYPE::Str: strips the first 2 and the last character, then unescapes the rest with str_unescape_c
+    void token_fmt_str(Allocator &al, Str &s) const
+    {
+        s.p = (char*) tok + 2; // presumably skips the `f` prefix and the opening quote
+        s.n = cur-tok-3; // token length minus the 2 leading and 1 trailing stripped characters
+        s.p = str_unescape_c(al, s); // s.p now refers to the unescaped text returned by str_unescape_c
+        s.n = strlen(s.p); // NOTE(review): strlen stops at the first NUL byte; confirm an escape yielding NUL cannot truncate the text
+    }
+
+    // Return the current token as YYSTYPE::Str: strips the first 4 and the last 3 chars, then unescapes the rest with str_unescape_c
+    void token_fmt_str3(Allocator &al, Str &s) const
+    {
+        s.p = (char*) tok + 4; // presumably skips the `f` prefix and a triple opening quote
+        s.n = cur-tok-7; // token length minus the 4 leading and 3 trailing stripped characters
+        s.p = str_unescape_c(al, s); // s.p now refers to the unescaped text returned by str_unescape_c
+        s.n = strlen(s.p); // NOTE(review): strlen stops at the first NUL byte; confirm an escape yielding NUL cannot truncate the text
+    }
+
+    // Return the current token as YYSTYPE::Str with the first 3 characters and the last character stripped; the text is not unescaped
+    void token_raw_fmt_str(Str &s) const
+    {
+        s.p = (char*) tok + 3; // points into the token itself; presumably skips the two-letter `rf`/`fr` prefix and the opening quote
+        s.n = cur-tok-4; // token length minus the 3 leading and 1 trailing stripped characters
+    }
+
+    // Return the current token as YYSTYPE::Str with the first 5 and the last 3 chars stripped; the text is not unescaped
+    void token_raw_fmt_str3(Str &s) const
+    {
+        s.p = (char*) tok + 5; // points into the token itself; presumably skips the two-letter `rf`/`fr` prefix and a triple opening quote
+        s.n = cur-tok-8; // token length minus the 5 leading and 3 trailing stripped characters
+    }
+
+    // Return the current token as YYSTYPE::Str rebuilt as b'<text>': strips the first 2 and the last character, unescapes the rest, then wraps it in b'...'
+    void token_bytes(Allocator &al, Str &s) const
+    {
+        s.p = (char*) tok + 2; // presumably skips the `b` prefix and the opening quote
+        s.n = cur-tok-3; // token length minus the 2 leading and 1 trailing stripped characters
+        std::string s_ = str_unescape_c(al, s); // unescaped text copied into a std::string; NOTE(review): a NUL in the result would end the copy early, confirm
+        s_ = "b'" + s_ + "'"; // always re-quote with single quotes, whatever quote the source used
+        s.p = s2c(al, s_); // back to a C string via s2c, called with the allocator
+        s.n = strlen(s.p); // length of the rebuilt b'...' text
+    }
+
+    // Return the current token as YYSTYPE::Str rebuilt as b'<text>': strips the first 4 and the last 3 chars, unescapes the rest, then wraps it in b'...'
+    void token_bytes3(Allocator &al, Str &s) const
+    {
+        s.p = (char*) tok + 4; // presumably skips the `b` prefix and a triple opening quote
+        s.n = cur-tok-7; // token length minus the 4 leading and 3 trailing stripped characters
+        std::string s_ = str_unescape_c(al, s); // unescaped text copied into a std::string; NOTE(review): a NUL in the result would end the copy early, confirm
+        s_ = "b'" + s_ + "'"; // always re-quote with single quotes, whatever quote the source used
+        s.p = s2c(al, s_); // back to a C string via s2c, called with the allocator
+        s.n = strlen(s.p); // length of the rebuilt b'...' text
+    }
+
+    // Return the current token as YYSTYPE::Str, rewritten in place to the form b'string' (no unescaping); drops the first character of the token
+    void token_raw_bytes(Str &s) const
+    {
+        s.p = (char*) tok + 1; // start one character in, leaving two slots before the text to overwrite with b and '
+        s.n = cur-tok-1; // everything from that position to the end of the token
+        s.p[0] = 'b'; // NOTE(review): these three stores write through tok, changing the scanned text in place from a const method
+        s.p[1] = '\''; // overwrites what is presumably the opening quote
+        s.p[s.n - 1] = '\''; // overwrites the last character of the token, presumably the closing quote
+    }
+
+    // Return the current token as YYSTYPE::Str, rewritten in place to the form b'string' (no unescaping); drops the first 3 and the last 2 characters
+    void token_raw_bytes3(Str &s) const
+    {
+        s.p = (char*) tok + 3; // start three characters in, leaving two slots before the text to overwrite with b and '
+        s.n = cur-tok-5; // token length minus the 3 leading and 2 trailing dropped characters
+        s.p[0] = 'b'; // NOTE(review): these three stores write through tok, changing the scanned text in place from a const method
+        s.p[1] = '\''; // overwrites what is presumably the last character of the triple opening quote
+        s.p[s.n - 1] = '\''; // overwrites what is presumably the first character of the triple closing quote
     }
 
     // Return the current token's location
diff --git a/src/lpython/parser/tokenizer.re b/src/lpython/parser/tokenizer.re
@@ -290,6 +290,18 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
                             | ("''" | "''" "\\"+) [^'\x00\\]
                             | [^'\x00\\] )*
                       "'''";
+            raw_str1 = 'r' (string1 | string2);
+            raw_str2 = 'r' (string3 | string4);
+            unicode_str1 = 'u' (string1 | string2);
+            unicode_str2 = 'u' (string3 | string4);
+            fmt_str1 = 'f' (string1 | string2);
+            fmt_str2 = 'f' (string3 | string4);
+            raw_fmt_str1 = ('rf' | 'fr') (string1 | string2);
+            raw_fmt_str2 = ('rf' | 'fr') (string3 | string4);
+            bytes1 = 'b' (string1 | string2);
+            bytes2 = 'b' (string3 | string4);
+            raw_bytes1 = ('rb' | 'br') (string1 | string2);
+            raw_bytes2 = ('rb' | 'br') (string3 | string4);
             type_ignore = "#" whitespace? "type:" whitespace? "ignore" [^\n\x00]*;
             type_comment = "#" whitespace? "type:" whitespace? [^\n\x00]*;
             comment = "#" [^\n\x00]*;
@@ -583,10 +595,28 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
             }
             //docstring { RET(TK_DOCSTRING) }
 
-            string1 { token_str(yylval.string); RET(TK_STRING) }
-            string2 { token_str(yylval.string); RET(TK_STRING) }
-            string3 { token_str3(yylval.string); RET(TK_STRING) }
-            string4 { token_str3(yylval.string); RET(TK_STRING) }
+            string1 { token_str(al, yylval.string); RET(TK_STRING) }
+            string2 { token_str(al, yylval.string); RET(TK_STRING) }
+            string3 { token_str3(al, yylval.string); RET(TK_STRING) }
+            string4 { token_str3(al, yylval.string); RET(TK_STRING) }
+
+            raw_str1 { token_raw_str(yylval.string); RET(TK_RAW_STRING) }
+            raw_str2 { token_raw_str3(yylval.string); RET(TK_RAW_STRING) }
+
+            unicode_str1 { token_unicode_str(al, yylval.string); RET(TK_UNI_STRING) }
+            unicode_str2 { token_unicode_str3(al, yylval.string); RET(TK_UNI_STRING) }
+
+            fmt_str1 { token_fmt_str(al, yylval.string); RET(TK_FMT_STRING) }
+            fmt_str2 { token_fmt_str3(al, yylval.string); RET(TK_FMT_STRING) }
+
+            raw_fmt_str1 { token_raw_fmt_str(yylval.string); RET(TK_RAW_FMT_STRING) }
+            raw_fmt_str2 { token_raw_fmt_str3(yylval.string); RET(TK_RAW_FMT_STRING) }
+
+            bytes1 { token_bytes(al, yylval.string); RET(TK_BYTES) }
+            bytes2 { token_bytes3(al, yylval.string); RET(TK_BYTES) }
+
+            raw_bytes1 { token_raw_bytes(yylval.string); RET(TK_RAW_BYTES) }
+            raw_bytes2 { token_raw_bytes3(yylval.string); RET(TK_RAW_BYTES) }
 
             name { token(yylval.string); RET(TK_NAME) }
         */
@@ -687,6 +717,12 @@ std::string token2text(const int token)
         T(TK_AT, "@")
 
         T(TK_STRING, "string")
+        T(TK_RAW_STRING, "raw_str")
+        T(TK_UNI_STRING, "unicode_str")
+        T(TK_FMT_STRING, "fmt_string")
+        T(TK_RAW_FMT_STRING, "raw_fmt_string")
+        T(TK_BYTES, "bytes")
+        T(TK_RAW_BYTES, "raw_bytes")
         T(TK_COMMENT, "comment")
         T(TK_EOLCOMMENT, "eolcomment")
         T(TK_TYPE_COMMENT, "type_comment")
@@ -824,7 +860,13 @@ std::string pickle_token(int token, const YYSTYPE &yystype)
         t += " " + std::to_string(yystype.f);
     } else if (token == yytokentype::TK_IMAG_NUM) {
         t += " " + std::to_string(yystype.f) + "j";
-    } else if (token == yytokentype::TK_STRING) {
+    } else if (token == yytokentype::TK_STRING
+        || token == yytokentype::TK_RAW_STRING
+        || token == yytokentype::TK_UNI_STRING
+        || token == yytokentype::TK_FMT_STRING
+        || token == yytokentype::TK_RAW_FMT_STRING
+        || token == yytokentype::TK_BYTES
+        || token == yytokentype::TK_RAW_BYTES) {
         t = t + " " + "\"" + str_escape_c(yystype.string.str()) + "\"";
     } else if (token == yytokentype::TK_TYPE_COMMENT) {
         t = t + " " + "\"" + yystype.string.str() + "\"";