Skip to content

Commit da3b5c0

Browse files
committed
Unescape at tokenizer level
Support raw string tokens. Support bytes and raw bytes. Add support for unicode, fmt, and raw_fmt strings.
1 parent d20a094 commit da3b5c0

File tree

4 files changed

+215
-67
lines changed

4 files changed

+215
-67
lines changed

src/lpython/parser/parser.yy

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,12 @@ void yyerror(YYLTYPE *yyloc, LCompilers::LPython::Parser &p, const std::string &
103103
%token TK_CARET "^"
104104
%token TK_AT "@"
105105
%token <string> TK_STRING
106+
%token <string> TK_RAW_STRING
107+
%token <string> TK_UNI_STRING
108+
%token <string> TK_FMT_STRING
109+
%token <string> TK_RAW_FMT_STRING
110+
%token <string> TK_BYTES
111+
%token <string> TK_RAW_BYTES
106112
%token <string> TK_COMMENT
107113
%token <string> TK_EOLCOMMENT
108114
%token <string> TK_TYPE_COMMENT
@@ -1101,10 +1107,20 @@ subscript
11011107
;
11021108

11031109
string
1104-
: string TK_STRING { $$ = STRING2($1, $2, @$); } // TODO
1105-
| string id TK_STRING { $$ = STRING4($1, STRING3($2, $3, @$), @$); }
1110+
: string TK_STRING { $$ = STRING4($1, $2, @$); } // TODO
1111+
| string TK_RAW_STRING { $$ = STRING5($1, STRING1($2, @$), @$); }
1112+
| string TK_UNI_STRING { $$ = STRING5($1, STRING2($2, @$), @$); }
1113+
| string TK_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); }
1114+
| string TK_RAW_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); }
1115+
| string TK_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); }
1116+
| string TK_RAW_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); }
11061117
| TK_STRING { $$ = STRING1($1, @$); }
1107-
| id TK_STRING { $$ = STRING3($1, $2, @$); }
1118+
| TK_RAW_STRING { $$ = STRING1($1, @$); }
1119+
| TK_UNI_STRING { $$ = STRING2($1, @$); }
1120+
| TK_FMT_STRING { $$ = STRING3($1, @$); }
1121+
| TK_RAW_FMT_STRING { $$ = STRING3($1, @$); }
1122+
| TK_BYTES { $$ = BYTES1($1, @$); }
1123+
| TK_RAW_BYTES { $$ = BYTES1($1, @$); }
11081124
;
11091125

11101126
lambda_parameter

src/lpython/parser/semantics.h

Lines changed: 35 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -798,87 +798,65 @@ static inline ast_t* concat_string(Allocator &al, Location &l,
798798
x.c_str(p.m_a), expr_contextType::Load)
799799
// `x.int_n` is of type BigInt but we store the int64_t directly in AST
800800
#define INTEGER(x, l) make_ConstantInt_t(p.m_a, l, x, nullptr)
801-
#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, str_unescape_c(p.m_a, x), nullptr)
802-
#define STRING2(x, y, l) concat_string(p.m_a, l, EXPR(x), str_unescape_c(p.m_a, y), nullptr)
803-
#define STRING3(id, x, l) PREFIX_STRING(p.m_a, l, name2char(id), x.c_str(p.m_a))
804-
#define STRING4(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s))
801+
#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), nullptr)
802+
#define STRING2(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), LCompilers::s2c(p.m_a, "u"))
803+
#define STRING3(x, l) FMT_STRING(p.m_a, l, x.c_str(p.m_a))
804+
#define STRING4(x, y, l) concat_string(p.m_a, l, EXPR(x), y.c_str(p.m_a), nullptr)
805+
#define STRING5(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s))
806+
#define BYTES1(x, l) make_ConstantBytes_t(p.m_a, l, x.c_str(p.m_a), nullptr)
805807
#define FLOAT(x, l) make_ConstantFloat_t(p.m_a, l, x, nullptr)
806808
#define COMPLEX(x, l) make_ConstantComplex_t(p.m_a, l, 0, x, nullptr)
807809
#define BOOL(x, l) make_ConstantBool_t(p.m_a, l, x, nullptr)
808810

809-
static inline ast_t *PREFIX_STRING(Allocator &al, Location &l, char *prefix, char *s){
811+
static inline ast_t *FMT_STRING(Allocator &al, Location &l, char *s){
810812
Vec<expr_t *> exprs;
811813
exprs.reserve(al, 4);
812814
ast_t *tmp = nullptr;
813-
if (strcmp(prefix, "U") == 0 ) {
814-
return make_ConstantStr_t(al, l, s, nullptr);
815-
}
816-
for (size_t i = 0; i < strlen(prefix); i++) {
817-
prefix[i] = tolower(prefix[i]);
818-
}
819-
if (strcmp(prefix, "f") == 0 || strcmp(prefix, "fr") == 0
820-
|| strcmp(prefix, "rf") == 0) {
821-
std::string str = std::string(s);
822-
std::string s1 = "\"";
823-
std::string id;
824-
std::vector<std::string> strs;
825-
bool open_paren = false;
826-
for (size_t i = 0; i < str.length(); i++) {
815+
816+
std::string str = std::string(s);
817+
std::string s1 = "\"";
818+
std::string id;
819+
std::vector<std::string> strs;
820+
bool open_paren = false;
821+
for (size_t i = 0; i < str.length(); i++) {
827822
if(str[i] == '{') {
828-
if(s1 != "\"") {
823+
if(s1 != "\"") {
829824
s1.push_back('"');
830825
strs.push_back(s1);
831826
s1 = "\"";
832-
}
833-
open_paren = true;
827+
}
828+
open_paren = true;
834829
} else if (str[i] != '}' && open_paren) {
835-
id.push_back(s[i]);
830+
id.push_back(s[i]);
836831
} else if (str[i] == '}') {
837-
if(id != "") {
832+
if(id != "") {
838833
strs.push_back(id);
839834
id = "";
840-
}
841-
open_paren = false;
835+
}
836+
open_paren = false;
842837
} else if (!open_paren) {
843-
s1.push_back(s[i]);
838+
s1.push_back(s[i]);
844839
}
845840
if(i == str.length()-1 && s1 != "\"") {
846-
s1.push_back('"');
847-
strs.push_back(s1);
841+
s1.push_back('"');
842+
strs.push_back(s1);
848843
}
849-
}
844+
}
850845

851-
for (size_t i = 0; i < strs.size(); i++) {
846+
for (size_t i = 0; i < strs.size(); i++) {
852847
if (strs[i][0] == '"') {
853-
strs[i] = strs[i].substr(1, strs[i].length() - 2);
854-
tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr);
855-
exprs.push_back(al, down_cast<expr_t>(tmp));
848+
strs[i] = strs[i].substr(1, strs[i].length() - 2);
849+
tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr);
850+
exprs.push_back(al, down_cast<expr_t>(tmp));
856851
} else {
857-
tmp = make_Name_t(al, l,
858-
LCompilers::s2c(al, strs[i]), expr_contextType::Load);
859-
tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr);
860-
exprs.push_back(al, down_cast<expr_t>(tmp));
852+
tmp = make_Name_t(al, l,
853+
LCompilers::s2c(al, strs[i]), expr_contextType::Load);
854+
tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr);
855+
exprs.push_back(al, down_cast<expr_t>(tmp));
861856
}
862-
}
863-
tmp = make_JoinedStr_t(al, l, exprs.p, exprs.size());
864-
} else if (strcmp(prefix, "b") == 0) {
865-
LCompilers::Str s_;
866-
s_.from_str(al, std::string(s));
867-
std::string str = std::string(str_unescape_c(al, s_));
868-
str = "b'" + str + "'";
869-
tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr);
870-
} else if ( strcmp(prefix, "br") == 0 || strcmp(prefix, "rb") == 0) {
871-
std::string str = std::string(s);
872-
str = "b'" + str + "'";
873-
tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr);
874-
} else if (strcmp(prefix, "r") == 0 ) {
875-
tmp = make_ConstantStr_t(al, l, s, nullptr);
876-
} else if (strcmp(prefix, "u") == 0 ) {
877-
tmp = make_ConstantStr_t(al, l, s, LCompilers::s2c(al, "u"));
878-
} else {
879-
throw LCompilers::LCompilersException("The string is not recognized by the parser.");
880857
}
881-
return tmp;
858+
859+
return make_JoinedStr_t(al, l, exprs.p, exprs.size());
882860
}
883861

884862
static inline keyword_t *CALL_ARG_KW(Allocator &al, Location &l,

src/lpython/parser/tokenizer.h

Lines changed: 114 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <libasr/exception.h>
55
#include <libasr/alloc.h>
6+
#include <libasr/string_utils.h>
67
#include <lpython/parser/parser_stype.h>
78

89
#define MAX_PAREN_LEVEL 200
@@ -54,18 +55,129 @@ class Tokenizer
5455
}
5556

5657
// Return the current token as YYSTYPE::Str, strips first and last character
57-
void token_str(Str &s) const
58+
void token_str(Allocator &al, Str &s) const
5859
{
5960
s.p = (char*) tok + 1;
6061
s.n = cur-tok-2;
62+
s.p = str_unescape_c(al, s);
63+
s.n = strlen(s.p);
6164
}
6265

6366
// Return the current token as YYSTYPE::Str, strips the first 3 and the last
6467
// 3 characters
65-
void token_str3(Str &s) const
68+
void token_str3(Allocator &al, Str &s) const
6669
{
6770
s.p = (char*) tok + 3;
6871
s.n = cur-tok-6;
72+
s.p = str_unescape_c(al, s);
73+
s.n = strlen(s.p);
74+
}
75+
76+
// Return the current token as YYSTYPE::Str, strips first 2 characters and last character
77+
void token_raw_str(Str &s) const
78+
{
79+
s.p = (char*) tok + 2;
80+
s.n = cur-tok-3;
81+
}
82+
83+
// Return the current token as YYSTYPE::Str, strips the first 4 and the last
84+
// 3 characters
85+
void token_raw_str3(Str &s) const
86+
{
87+
s.p = (char*) tok + 4;
88+
s.n = cur-tok-7;
89+
}
90+
91+
// Return the current token as YYSTYPE::Str, strips first 2 and last character
92+
void token_unicode_str(Allocator &al, Str &s) const
93+
{
94+
s.p = (char*) tok + 2;
95+
s.n = cur-tok-3;
96+
s.p = str_unescape_c(al, s);
97+
s.n = strlen(s.p);
98+
}
99+
100+
// Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
101+
void token_unicode_str3(Allocator &al, Str &s) const
102+
{
103+
s.p = (char*) tok + 4;
104+
s.n = cur-tok-7;
105+
s.p = str_unescape_c(al, s);
106+
s.n = strlen(s.p);
107+
}
108+
109+
// Return the current token as YYSTYPE::Str, strips first 2 and last character
110+
void token_fmt_str(Allocator &al, Str &s) const
111+
{
112+
s.p = (char*) tok + 2;
113+
s.n = cur-tok-3;
114+
s.p = str_unescape_c(al, s);
115+
s.n = strlen(s.p);
116+
}
117+
118+
// Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
119+
void token_fmt_str3(Allocator &al, Str &s) const
120+
{
121+
s.p = (char*) tok + 4;
122+
s.n = cur-tok-7;
123+
s.p = str_unescape_c(al, s);
124+
s.n = strlen(s.p);
125+
}
126+
127+
// Return the current token as YYSTYPE::Str, strips first 3 and last character
128+
void token_raw_fmt_str(Str &s) const
129+
{
130+
s.p = (char*) tok + 3;
131+
s.n = cur-tok-4;
132+
}
133+
134+
// Return the current token as YYSTYPE::Str, strips the first 5 and last 3 chars
135+
void token_raw_fmt_str3(Str &s) const
136+
{
137+
s.p = (char*) tok + 5;
138+
s.n = cur-tok-8;
139+
}
140+
141+
// Return the current token as YYSTYPE::Str, replaces `"` with `'` and prepends 'b'
142+
void token_bytes(Allocator &al, Str &s) const
143+
{
144+
s.p = (char*) tok + 2;
145+
s.n = cur-tok-3;
146+
std::string s_ = str_unescape_c(al, s);
147+
s_ = "b'" + s_ + "'";
148+
s.p = s2c(al, s_);
149+
s.n = strlen(s.p);
150+
}
151+
152+
// Return the current token as YYSTYPE::Str, replaces `"""` or `'''` with `'` and prepends 'b'
153+
void token_bytes3(Allocator &al, Str &s) const
154+
{
155+
s.p = (char*) tok + 4;
156+
s.n = cur-tok-7;
157+
std::string s_ = str_unescape_c(al, s);
158+
s_ = "b'" + s_ + "'";
159+
s.p = s2c(al, s_);
160+
s.n = strlen(s.p);
161+
}
162+
163+
// Return the current token as YYSTYPE::Str, transforms the string to b'string'
164+
void token_raw_bytes(Str &s) const
165+
{
166+
s.p = (char*) tok + 1;
167+
s.n = cur-tok-1;
168+
s.p[0] = 'b';
169+
s.p[1] = '\'';
170+
s.p[s.n - 1] = '\'';
171+
}
172+
173+
// Return the current token as YYSTYPE::Str, transforms the string to b'string'
174+
void token_raw_bytes3(Str &s) const
175+
{
176+
s.p = (char*) tok + 3;
177+
s.n = cur-tok-5;
178+
s.p[0] = 'b';
179+
s.p[1] = '\'';
180+
s.p[s.n - 1] = '\'';
69181
}
70182

71183
// Return the current token's location

src/lpython/parser/tokenizer.re

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,18 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
290290
| ("''" | "''" "\\"+) [^'\x00\\]
291291
| [^'\x00\\] )*
292292
"'''";
293+
raw_str1 = 'r' (string1 | string2);
294+
raw_str2 = 'r' (string3 | string4);
295+
unicode_str1 = 'u' (string1 | string2);
296+
unicode_str2 = 'u' (string3 | string4);
297+
fmt_str1 = 'f' (string1 | string2);
298+
fmt_str2 = 'f' (string3 | string4);
299+
raw_fmt_str1 = ('rf' | 'fr') (string1 | string2);
300+
raw_fmt_str2 = ('rf' | 'fr') (string3 | string4);
301+
bytes1 = 'b' (string1 | string2);
302+
bytes2 = 'b' (string3 | string4);
303+
raw_bytes1 = ('rb' | 'br') (string1 | string2);
304+
raw_bytes2 = ('rb' | 'br') (string3 | string4);
293305
type_ignore = "#" whitespace? "type:" whitespace? "ignore" [^\n\x00]*;
294306
type_comment = "#" whitespace? "type:" whitespace? [^\n\x00]*;
295307
comment = "#" [^\n\x00]*;
@@ -583,10 +595,28 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
583595
}
584596
//docstring { RET(TK_DOCSTRING) }
585597
586-
string1 { token_str(yylval.string); RET(TK_STRING) }
587-
string2 { token_str(yylval.string); RET(TK_STRING) }
588-
string3 { token_str3(yylval.string); RET(TK_STRING) }
589-
string4 { token_str3(yylval.string); RET(TK_STRING) }
598+
string1 { token_str(al, yylval.string); RET(TK_STRING) }
599+
string2 { token_str(al, yylval.string); RET(TK_STRING) }
600+
string3 { token_str3(al, yylval.string); RET(TK_STRING) }
601+
string4 { token_str3(al, yylval.string); RET(TK_STRING) }
602+
603+
raw_str1 { token_raw_str(yylval.string); RET(TK_RAW_STRING) }
604+
raw_str2 { token_raw_str3(yylval.string); RET(TK_RAW_STRING) }
605+
606+
unicode_str1 { token_unicode_str(al, yylval.string); RET(TK_UNI_STRING) }
607+
unicode_str2 { token_unicode_str3(al, yylval.string); RET(TK_UNI_STRING) }
608+
609+
fmt_str1 { token_fmt_str(al, yylval.string); RET(TK_FMT_STRING) }
610+
fmt_str2 { token_fmt_str3(al, yylval.string); RET(TK_FMT_STRING) }
611+
612+
raw_fmt_str1 { token_raw_fmt_str(yylval.string); RET(TK_RAW_FMT_STRING) }
613+
raw_fmt_str2 { token_raw_fmt_str3(yylval.string); RET(TK_RAW_FMT_STRING) }
614+
615+
bytes1 { token_bytes(al, yylval.string); RET(TK_BYTES) }
616+
bytes2 { token_bytes3(al, yylval.string); RET(TK_BYTES) }
617+
618+
raw_bytes1 { token_raw_bytes(yylval.string); RET(TK_RAW_BYTES) }
619+
raw_bytes2 { token_raw_bytes3(yylval.string); RET(TK_RAW_BYTES) }
590620
591621
name { token(yylval.string); RET(TK_NAME) }
592622
*/
@@ -687,6 +717,12 @@ std::string token2text(const int token)
687717
T(TK_AT, "@")
688718

689719
T(TK_STRING, "string")
720+
T(TK_RAW_STRING, "raw_str")
721+
T(TK_UNI_STRING, "unicode_str")
722+
T(TK_FMT_STRING, "fmt_string")
723+
T(TK_RAW_FMT_STRING, "raw_fmt_string")
724+
T(TK_BYTES, "bytes")
725+
T(TK_RAW_BYTES, "raw_bytes")
690726
T(TK_COMMENT, "comment")
691727
T(TK_EOLCOMMENT, "eolcomment")
692728
T(TK_TYPE_COMMENT, "type_comment")
@@ -824,7 +860,13 @@ std::string pickle_token(int token, const YYSTYPE &yystype)
824860
t += " " + std::to_string(yystype.f);
825861
} else if (token == yytokentype::TK_IMAG_NUM) {
826862
t += " " + std::to_string(yystype.f) + "j";
827-
} else if (token == yytokentype::TK_STRING) {
863+
} else if (token == yytokentype::TK_STRING
864+
|| token == yytokentype::TK_RAW_STRING
865+
|| token == yytokentype::TK_UNI_STRING
866+
|| token == yytokentype::TK_FMT_STRING
867+
|| token == yytokentype::TK_RAW_FMT_STRING
868+
|| token == yytokentype::TK_BYTES
869+
|| token == yytokentype::TK_RAW_BYTES) {
828870
t = t + " " + "\"" + str_escape_c(yystype.string.str()) + "\"";
829871
} else if (token == yytokentype::TK_TYPE_COMMENT) {
830872
t = t + " " + "\"" + yystype.string.str() + "\"";

0 commit comments

Comments
 (0)