// Patch summary (free-text preamble placed before the first `diff --git` header).
//
// Purpose: move recognition of Python string prefixes out of the grammar and into the tokenizer.
//
// parser.yy
//   - Declares six new tokens: TK_RAW_STRING, TK_UNI_STRING, TK_FMT_STRING, TK_RAW_FMT_STRING,
//     TK_BYTES, TK_RAW_BYTES.
//   - The `string` rule drops the old `id TK_STRING` alternatives and gets one alternative per new
//     token, both standalone and as a continuation of a preceding `string`.
//   - TK_RAW_STRING reuses STRING1, TK_RAW_FMT_STRING reuses STRING3, TK_RAW_BYTES reuses BYTES1.
//
// semantics.h
//   - STRING1 no longer calls str_unescape_c; it passes x.c_str(p.m_a) straight through.
//   - STRING2 builds a ConstantStr whose last argument is "u"; STRING3 calls FMT_STRING;
//     STRING4/STRING5 call concat_string; BYTES1 builds a ConstantBytes.
//   - PREFIX_STRING(prefix, s) is replaced by FMT_STRING(s), which keeps only the old f-string
//     branch: text outside `{...}` becomes ConstantStr pieces, text inside becomes a Name wrapped
//     in FormattedValue, and the pieces are returned as one JoinedStr.
//
// tokenizer.h
//   - token_str / token_str3 now take an Allocator, call str_unescape_c and reset s.n via strlen.
//   - New token_* helpers strip the prefix and quotes for each literal kind; the raw variants do
//     not unescape.
//   - token_bytes / token_bytes3 unescape and rebuild the text as b'...'.
//   - token_raw_bytes / token_raw_bytes3 write b, ' and ' through s.p, which points into `tok`.
//
// tokenizer.re
//   - Adds re2c definitions and lexer rules for each prefixed literal, token2text names for the
//     new tokens, and extends pickle_token so all string-like tokens print via str_escape_c.
//
// NOTE(review): `s.n = strlen(s.p)` after unescaping presumably truncates a literal whose escapes
//   produce an embedded NUL byte -- confirm what str_unescape_c returns.
// NOTE(review): token_raw_bytes / token_raw_bytes3 modify the buffer behind `tok` in place; this
//   assumes the input buffer is writable and is not re-read afterwards -- confirm.
// NOTE(review): FMT_STRING treats everything between `{` and `}` as a plain Name; no handling of
//   `{{` / `}}` escapes, format specs or nested expressions is visible in this patch.
// NOTE(review): the prefix patterns use single-quoted re2c literals ('rf', 'rb', ...), presumably
//   case-insensitive -- confirm against the re2c manual.
// NOTE(review): this copy of the patch appears to have lost its line breaks and some
//   angle-bracketed text (e.g. a bare `#include`, `Vec exprs`); verify against the original.
diff --git a/src/lpython/parser/parser.yy b/src/lpython/parser/parser.yy index 79012d5be8..a93d3fbd83 100644 --- a/src/lpython/parser/parser.yy +++ b/src/lpython/parser/parser.yy @@ -103,6 +103,12 @@ void yyerror(YYLTYPE *yyloc, LCompilers::LPython::Parser &p, const std::string & %token TK_CARET "^" %token TK_AT "@" %token TK_STRING +%token TK_RAW_STRING +%token TK_UNI_STRING +%token TK_FMT_STRING +%token TK_RAW_FMT_STRING +%token TK_BYTES +%token TK_RAW_BYTES %token TK_COMMENT %token TK_EOLCOMMENT %token TK_TYPE_COMMENT @@ -1101,10 +1107,20 @@ subscript ; string - : string TK_STRING { $$ = STRING2($1, $2, @$); } // TODO - | string id TK_STRING { $$ = STRING4($1, STRING3($2, $3, @$), @$); } + : string TK_STRING { $$ = STRING4($1, $2, @$); } // TODO + | string TK_RAW_STRING { $$ = STRING5($1, STRING1($2, @$), @$); } + | string TK_UNI_STRING { $$ = STRING5($1, STRING2($2, @$), @$); } + | string TK_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); } + | string TK_RAW_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); } + | string TK_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); } + | string TK_RAW_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); } | TK_STRING { $$ = STRING1($1, @$); } - | id TK_STRING { $$ = STRING3($1, $2, @$); } + | TK_RAW_STRING { $$ = STRING1($1, @$); } + | TK_UNI_STRING { $$ = STRING2($1, @$); } + | TK_FMT_STRING { $$ = STRING3($1, @$); } + | TK_RAW_FMT_STRING { $$ = STRING3($1, @$); } + | TK_BYTES { $$ = BYTES1($1, @$); } + | TK_RAW_BYTES { $$ = BYTES1($1, @$); } ; lambda_parameter diff --git a/src/lpython/parser/semantics.h b/src/lpython/parser/semantics.h index f4059fa0d9..6256381565 100644 --- a/src/lpython/parser/semantics.h +++ b/src/lpython/parser/semantics.h @@ -798,87 +798,65 @@ static inline ast_t* concat_string(Allocator &al, Location &l, x.c_str(p.m_a), expr_contextType::Load) // `x.int_n` is of type BigInt but we store the int64_t directly in AST #define INTEGER(x, l) make_ConstantInt_t(p.m_a, l, x, nullptr) -#define STRING1(x, l) 
make_ConstantStr_t(p.m_a, l, str_unescape_c(p.m_a, x), nullptr) -#define STRING2(x, y, l) concat_string(p.m_a, l, EXPR(x), str_unescape_c(p.m_a, y), nullptr) -#define STRING3(id, x, l) PREFIX_STRING(p.m_a, l, name2char(id), x.c_str(p.m_a)) -#define STRING4(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s)) +#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), nullptr) +#define STRING2(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), LCompilers::s2c(p.m_a, "u")) +#define STRING3(x, l) FMT_STRING(p.m_a, l, x.c_str(p.m_a)) +#define STRING4(x, y, l) concat_string(p.m_a, l, EXPR(x), y.c_str(p.m_a), nullptr) +#define STRING5(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s)) +#define BYTES1(x, l) make_ConstantBytes_t(p.m_a, l, x.c_str(p.m_a), nullptr) #define FLOAT(x, l) make_ConstantFloat_t(p.m_a, l, x, nullptr) #define COMPLEX(x, l) make_ConstantComplex_t(p.m_a, l, 0, x, nullptr) #define BOOL(x, l) make_ConstantBool_t(p.m_a, l, x, nullptr) -static inline ast_t *PREFIX_STRING(Allocator &al, Location &l, char *prefix, char *s){ +static inline ast_t *FMT_STRING(Allocator &al, Location &l, char *s){ Vec exprs; exprs.reserve(al, 4); ast_t *tmp = nullptr; - if (strcmp(prefix, "U") == 0 ) { - return make_ConstantStr_t(al, l, s, nullptr); - } - for (size_t i = 0; i < strlen(prefix); i++) { - prefix[i] = tolower(prefix[i]); - } - if (strcmp(prefix, "f") == 0 || strcmp(prefix, "fr") == 0 - || strcmp(prefix, "rf") == 0) { - std::string str = std::string(s); - std::string s1 = "\""; - std::string id; - std::vector strs; - bool open_paren = false; - for (size_t i = 0; i < str.length(); i++) { + + std::string str = std::string(s); + std::string s1 = "\""; + std::string id; + std::vector strs; + bool open_paren = false; + for (size_t i = 0; i < str.length(); i++) { if(str[i] == '{') { - if(s1 != "\"") { + if(s1 != "\"") { s1.push_back('"'); strs.push_back(s1); s1 = "\""; - } - open_paren = true; + } + open_paren = true; } else if (str[i] != '}' && open_paren) 
{ - id.push_back(s[i]); + id.push_back(s[i]); } else if (str[i] == '}') { - if(id != "") { + if(id != "") { strs.push_back(id); id = ""; - } - open_paren = false; + } + open_paren = false; } else if (!open_paren) { - s1.push_back(s[i]); + s1.push_back(s[i]); } if(i == str.length()-1 && s1 != "\"") { - s1.push_back('"'); - strs.push_back(s1); + s1.push_back('"'); + strs.push_back(s1); } - } + } - for (size_t i = 0; i < strs.size(); i++) { + for (size_t i = 0; i < strs.size(); i++) { if (strs[i][0] == '"') { - strs[i] = strs[i].substr(1, strs[i].length() - 2); - tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr); - exprs.push_back(al, down_cast(tmp)); + strs[i] = strs[i].substr(1, strs[i].length() - 2); + tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr); + exprs.push_back(al, down_cast(tmp)); } else { - tmp = make_Name_t(al, l, - LCompilers::s2c(al, strs[i]), expr_contextType::Load); - tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr); - exprs.push_back(al, down_cast(tmp)); + tmp = make_Name_t(al, l, + LCompilers::s2c(al, strs[i]), expr_contextType::Load); + tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr); + exprs.push_back(al, down_cast(tmp)); } - } - tmp = make_JoinedStr_t(al, l, exprs.p, exprs.size()); - } else if (strcmp(prefix, "b") == 0) { - LCompilers::Str s_; - s_.from_str(al, std::string(s)); - std::string str = std::string(str_unescape_c(al, s_)); - str = "b'" + str + "'"; - tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr); - } else if ( strcmp(prefix, "br") == 0 || strcmp(prefix, "rb") == 0) { - std::string str = std::string(s); - str = "b'" + str + "'"; - tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr); - } else if (strcmp(prefix, "r") == 0 ) { - tmp = make_ConstantStr_t(al, l, s, nullptr); - } else if (strcmp(prefix, "u") == 0 ) { - tmp = make_ConstantStr_t(al, l, s, LCompilers::s2c(al, "u")); - } else { - throw LCompilers::LCompilersException("The 
string is not recognized by the parser."); } - return tmp; + + return make_JoinedStr_t(al, l, exprs.p, exprs.size()); } static inline keyword_t *CALL_ARG_KW(Allocator &al, Location &l, diff --git a/src/lpython/parser/tokenizer.h b/src/lpython/parser/tokenizer.h index 3fe00226bb..e45553a2a3 100644 --- a/src/lpython/parser/tokenizer.h +++ b/src/lpython/parser/tokenizer.h @@ -3,6 +3,7 @@ #include #include +#include #include #define MAX_PAREN_LEVEL 200 @@ -54,18 +55,129 @@ class Tokenizer } // Return the current token as YYSTYPE::Str, strips first and last character - void token_str(Str &s) const + void token_str(Allocator &al, Str &s) const { s.p = (char*) tok + 1; s.n = cur-tok-2; + s.p = str_unescape_c(al, s); + s.n = strlen(s.p); } // Return the current token as YYSTYPE::Str, strips the first 3 and the last // 3 characters - void token_str3(Str &s) const + void token_str3(Allocator &al, Str &s) const { s.p = (char*) tok + 3; s.n = cur-tok-6; + s.p = str_unescape_c(al, s); + s.n = strlen(s.p); + } + + // Return the current token as YYSTYPE::Str, strips first 2 characters and last character + void token_raw_str(Str &s) const + { + s.p = (char*) tok + 2; + s.n = cur-tok-3; + } + + // Return the current token as YYSTYPE::Str, strips the first 4 and the last + // 3 characters + void token_raw_str3(Str &s) const + { + s.p = (char*) tok + 4; + s.n = cur-tok-7; + } + + // Return the current token as YYSTYPE::Str, strips first 2 and last character + void token_unicode_str(Allocator &al, Str &s) const + { + s.p = (char*) tok + 2; + s.n = cur-tok-3; + s.p = str_unescape_c(al, s); + s.n = strlen(s.p); + } + + // Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars + void token_unicode_str3(Allocator &al, Str &s) const + { + s.p = (char*) tok + 4; + s.n = cur-tok-7; + s.p = str_unescape_c(al, s); + s.n = strlen(s.p); + } + + // Return the current token as YYSTYPE::Str, strips first 2 and last character + void token_fmt_str(Allocator &al, Str &s) 
const + { + s.p = (char*) tok + 2; + s.n = cur-tok-3; + s.p = str_unescape_c(al, s); + s.n = strlen(s.p); + } + + // Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars + void token_fmt_str3(Allocator &al, Str &s) const + { + s.p = (char*) tok + 4; + s.n = cur-tok-7; + s.p = str_unescape_c(al, s); + s.n = strlen(s.p); + } + + // Return the current token as YYSTYPE::Str, strips first 3 and last character + void token_raw_fmt_str(Str &s) const + { + s.p = (char*) tok + 3; + s.n = cur-tok-4; + } + + // Return the current token as YYSTYPE::Str, strips the first 5 and last 3 chars + void token_raw_fmt_str3(Str &s) const + { + s.p = (char*) tok + 5; + s.n = cur-tok-8; + } + + // Return the current token as YYSTYPE::Str, replaces `"` with `'` and prepends 'b' + void token_bytes(Allocator &al, Str &s) const + { + s.p = (char*) tok + 2; + s.n = cur-tok-3; + std::string s_ = str_unescape_c(al, s); + s_ = "b'" + s_ + "'"; + s.p = s2c(al, s_); + s.n = strlen(s.p); + } + + // Return the current token as YYSTYPE::Str, replaces `"""` or `'''` with `'` and prepends 'b' + void token_bytes3(Allocator &al, Str &s) const + { + s.p = (char*) tok + 4; + s.n = cur-tok-7; + std::string s_ = str_unescape_c(al, s); + s_ = "b'" + s_ + "'"; + s.p = s2c(al, s_); + s.n = strlen(s.p); + } + + // Return the current token as YYSTYPE::Str, transforms the string to b'string' + void token_raw_bytes(Str &s) const + { + s.p = (char*) tok + 1; + s.n = cur-tok-1; + s.p[0] = 'b'; + s.p[1] = '\''; + s.p[s.n - 1] = '\''; + } + + // Return the current token as YYSTYPE::Str, transforms the string to b'string' + void token_raw_bytes3(Str &s) const + { + s.p = (char*) tok + 3; + s.n = cur-tok-5; + s.p[0] = 'b'; + s.p[1] = '\''; + s.p[s.n - 1] = '\''; } // Return the current token's location diff --git a/src/lpython/parser/tokenizer.re b/src/lpython/parser/tokenizer.re index a5074f46f5..dff5a18c08 100644 --- a/src/lpython/parser/tokenizer.re +++ b/src/lpython/parser/tokenizer.re @@ 
-290,6 +290,18 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost | ("''" | "''" "\\"+) [^'\x00\\] | [^'\x00\\] )* "'''"; + raw_str1 = 'r' (string1 | string2); + raw_str2 = 'r' (string3 | string4); + unicode_str1 = 'u' (string1 | string2); + unicode_str2 = 'u' (string3 | string4); + fmt_str1 = 'f' (string1 | string2); + fmt_str2 = 'f' (string3 | string4); + raw_fmt_str1 = ('rf' | 'fr') (string1 | string2); + raw_fmt_str2 = ('rf' | 'fr') (string3 | string4); + bytes1 = 'b' (string1 | string2); + bytes2 = 'b' (string3 | string4); + raw_bytes1 = ('rb' | 'br') (string1 | string2); + raw_bytes2 = ('rb' | 'br') (string3 | string4); type_ignore = "#" whitespace? "type:" whitespace? "ignore" [^\n\x00]*; type_comment = "#" whitespace? "type:" whitespace? [^\n\x00]*; comment = "#" [^\n\x00]*; @@ -583,10 +595,28 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost } //docstring { RET(TK_DOCSTRING) } - string1 { token_str(yylval.string); RET(TK_STRING) } - string2 { token_str(yylval.string); RET(TK_STRING) } - string3 { token_str3(yylval.string); RET(TK_STRING) } - string4 { token_str3(yylval.string); RET(TK_STRING) } + string1 { token_str(al, yylval.string); RET(TK_STRING) } + string2 { token_str(al, yylval.string); RET(TK_STRING) } + string3 { token_str3(al, yylval.string); RET(TK_STRING) } + string4 { token_str3(al, yylval.string); RET(TK_STRING) } + + raw_str1 { token_raw_str(yylval.string); RET(TK_RAW_STRING) } + raw_str2 { token_raw_str3(yylval.string); RET(TK_RAW_STRING) } + + unicode_str1 { token_unicode_str(al, yylval.string); RET(TK_UNI_STRING) } + unicode_str2 { token_unicode_str3(al, yylval.string); RET(TK_UNI_STRING) } + + fmt_str1 { token_fmt_str(al, yylval.string); RET(TK_FMT_STRING) } + fmt_str2 { token_fmt_str3(al, yylval.string); RET(TK_FMT_STRING) } + + raw_fmt_str1 { token_raw_fmt_str(yylval.string); RET(TK_RAW_FMT_STRING) } + raw_fmt_str2 { token_raw_fmt_str3(yylval.string); 
RET(TK_RAW_FMT_STRING) } + + bytes1 { token_bytes(al, yylval.string); RET(TK_BYTES) } + bytes2 { token_bytes3(al, yylval.string); RET(TK_BYTES) } + + raw_bytes1 { token_raw_bytes(yylval.string); RET(TK_RAW_BYTES) } + raw_bytes2 { token_raw_bytes3(yylval.string); RET(TK_RAW_BYTES) } name { token(yylval.string); RET(TK_NAME) } */ @@ -687,6 +717,12 @@ std::string token2text(const int token) T(TK_AT, "@") T(TK_STRING, "string") + T(TK_RAW_STRING, "raw_str") + T(TK_UNI_STRING, "unicode_str") + T(TK_FMT_STRING, "fmt_string") + T(TK_RAW_FMT_STRING, "raw_fmt_string") + T(TK_BYTES, "bytes") + T(TK_RAW_BYTES, "raw_bytes") T(TK_COMMENT, "comment") T(TK_EOLCOMMENT, "eolcomment") T(TK_TYPE_COMMENT, "type_comment") @@ -824,7 +860,13 @@ std::string pickle_token(int token, const YYSTYPE &yystype) t += " " + std::to_string(yystype.f); } else if (token == yytokentype::TK_IMAG_NUM) { t += " " + std::to_string(yystype.f) + "j"; - } else if (token == yytokentype::TK_STRING) { + } else if (token == yytokentype::TK_STRING + || token == yytokentype::TK_RAW_STRING + || token == yytokentype::TK_UNI_STRING + || token == yytokentype::TK_FMT_STRING + || token == yytokentype::TK_RAW_FMT_STRING + || token == yytokentype::TK_BYTES + || token == yytokentype::TK_RAW_BYTES) { t = t + " " + "\"" + str_escape_c(yystype.string.str()) + "\""; } else if (token == yytokentype::TK_TYPE_COMMENT) { t = t + " " + "\"" + yystype.string.str() + "\"";