Commit da3b5c0: Unescape at tokenizer level
Support raw string tokens

Support bytes and raw bytes

Add support for unicode, fmt, raw_fmt strings
Shaikh-Ubaid committed Jun 13, 2023
1 parent d20a094 commit da3b5c0
Showing 4 changed files with 215 additions and 67 deletions.
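
For orientation, here is an illustration that is not part of the commit: example Python literals and the token kind the new tokenizer rules emit for them, written as a small runnable C++ sketch. The literal texts are made up; the token names come from the diff below.

#include <cstdio>

int main() {
    // Example Python literals (left) and the token kind the new tokenizer
    // rules are expected to emit for them (right). Illustration only.
    const char *examples[][2] = {
        {"\"abc\\n\"",  "TK_STRING (value unescaped in the tokenizer)"},
        {"r\"abc\\n\"", "TK_RAW_STRING (value kept verbatim)"},
        {"u\"abc\"",    "TK_UNI_STRING"},
        {"f\"x={x}\"",  "TK_FMT_STRING"},
        {"rf\"x={x}\"", "TK_RAW_FMT_STRING"},
        {"b\"abc\"",    "TK_BYTES"},
        {"rb\"abc\"",   "TK_RAW_BYTES"},
    };
    for (const auto &e : examples)
        std::printf("%-12s -> %s\n", e[0], e[1]);
    return 0;
}
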
src/lpython/parser/parser.yy (22 changes: 19 additions & 3 deletions)
@@ -103,6 +103,12 @@ void yyerror(YYLTYPE *yyloc, LCompilers::LPython::Parser &p, const std::string &
%token TK_CARET "^"
%token TK_AT "@"
%token <string> TK_STRING
%token <string> TK_RAW_STRING
%token <string> TK_UNI_STRING
%token <string> TK_FMT_STRING
%token <string> TK_RAW_FMT_STRING
%token <string> TK_BYTES
%token <string> TK_RAW_BYTES
%token <string> TK_COMMENT
%token <string> TK_EOLCOMMENT
%token <string> TK_TYPE_COMMENT
@@ -1101,10 +1107,20 @@ subscript
;

string
: string TK_STRING { $$ = STRING2($1, $2, @$); } // TODO
| string id TK_STRING { $$ = STRING4($1, STRING3($2, $3, @$), @$); }
: string TK_STRING { $$ = STRING4($1, $2, @$); } // TODO
| string TK_RAW_STRING { $$ = STRING5($1, STRING1($2, @$), @$); }
| string TK_UNI_STRING { $$ = STRING5($1, STRING2($2, @$), @$); }
| string TK_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); }
| string TK_RAW_FMT_STRING { $$ = STRING5($1, STRING3($2, @$), @$); }
| string TK_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); }
| string TK_RAW_BYTES { $$ = STRING5($1, BYTES1($2, @$), @$); }
| TK_STRING { $$ = STRING1($1, @$); }
| id TK_STRING { $$ = STRING3($1, $2, @$); }
| TK_RAW_STRING { $$ = STRING1($1, @$); }
| TK_UNI_STRING { $$ = STRING2($1, @$); }
| TK_FMT_STRING { $$ = STRING3($1, @$); }
| TK_RAW_FMT_STRING { $$ = STRING3($1, @$); }
| TK_BYTES { $$ = BYTES1($1, @$); }
| TK_RAW_BYTES { $$ = BYTES1($1, @$); }
;
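
Implicit concatenation of adjacent literals still goes through the left-recursive string rule above, which folds the tokens in pairwise, left to right. The stand-alone C++ sketch below (illustration only, not LPython code) mirrors that fold order; the token values are hypothetical and assumed to be already unescaped by the tokenizer.

#include <iostream>
#include <string>
#include <vector>

int main() {
    // Hypothetical, already-tokenized values for the Python source
    //     "ab\n" r"\t" u"cd"
    // (non-raw pieces unescaped by the tokenizer, raw piece kept verbatim).
    std::vector<std::string> values = {"ab\n", "\\t", "cd"};

    std::string acc = values[0];                  // TK_STRING reduces to `string` first
    for (size_t i = 1; i < values.size(); i++)
        acc += values[i];                         // each following token is then folded in,
                                                  // left to right, as in the STRING4/STRING5 chain
    std::cout << acc << std::endl;                // "ab", a real newline, the two characters \t, "cd"
}
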

lambda_parameter
src/lpython/parser/semantics.h (92 changes: 35 additions & 57 deletions)
@@ -798,87 +798,65 @@ static inline ast_t* concat_string(Allocator &al, Location &l,
x.c_str(p.m_a), expr_contextType::Load)
// `x.int_n` is of type BigInt but we store the int64_t directly in AST
#define INTEGER(x, l) make_ConstantInt_t(p.m_a, l, x, nullptr)
#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, str_unescape_c(p.m_a, x), nullptr)
#define STRING2(x, y, l) concat_string(p.m_a, l, EXPR(x), str_unescape_c(p.m_a, y), nullptr)
#define STRING3(id, x, l) PREFIX_STRING(p.m_a, l, name2char(id), x.c_str(p.m_a))
#define STRING4(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s))
#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), nullptr)
#define STRING2(x, l) make_ConstantStr_t(p.m_a, l, x.c_str(p.m_a), LCompilers::s2c(p.m_a, "u"))
#define STRING3(x, l) FMT_STRING(p.m_a, l, x.c_str(p.m_a))
#define STRING4(x, y, l) concat_string(p.m_a, l, EXPR(x), y.c_str(p.m_a), nullptr)
#define STRING5(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s))
#define BYTES1(x, l) make_ConstantBytes_t(p.m_a, l, x.c_str(p.m_a), nullptr)
#define FLOAT(x, l) make_ConstantFloat_t(p.m_a, l, x, nullptr)
#define COMPLEX(x, l) make_ConstantComplex_t(p.m_a, l, 0, x, nullptr)
#define BOOL(x, l) make_ConstantBool_t(p.m_a, l, x, nullptr)

static inline ast_t *PREFIX_STRING(Allocator &al, Location &l, char *prefix, char *s){
static inline ast_t *FMT_STRING(Allocator &al, Location &l, char *s){
Vec<expr_t *> exprs;
exprs.reserve(al, 4);
ast_t *tmp = nullptr;
if (strcmp(prefix, "U") == 0 ) {
return make_ConstantStr_t(al, l, s, nullptr);
}
for (size_t i = 0; i < strlen(prefix); i++) {
prefix[i] = tolower(prefix[i]);
}
if (strcmp(prefix, "f") == 0 || strcmp(prefix, "fr") == 0
|| strcmp(prefix, "rf") == 0) {
std::string str = std::string(s);
std::string s1 = "\"";
std::string id;
std::vector<std::string> strs;
bool open_paren = false;
for (size_t i = 0; i < str.length(); i++) {

std::string str = std::string(s);
std::string s1 = "\"";
std::string id;
std::vector<std::string> strs;
bool open_paren = false;
for (size_t i = 0; i < str.length(); i++) {
if(str[i] == '{') {
if(s1 != "\"") {
if(s1 != "\"") {
s1.push_back('"');
strs.push_back(s1);
s1 = "\"";
}
open_paren = true;
}
open_paren = true;
} else if (str[i] != '}' && open_paren) {
id.push_back(s[i]);
id.push_back(s[i]);
} else if (str[i] == '}') {
if(id != "") {
if(id != "") {
strs.push_back(id);
id = "";
}
open_paren = false;
}
open_paren = false;
} else if (!open_paren) {
s1.push_back(s[i]);
s1.push_back(s[i]);
}
if(i == str.length()-1 && s1 != "\"") {
s1.push_back('"');
strs.push_back(s1);
s1.push_back('"');
strs.push_back(s1);
}
}
}

for (size_t i = 0; i < strs.size(); i++) {
for (size_t i = 0; i < strs.size(); i++) {
if (strs[i][0] == '"') {
strs[i] = strs[i].substr(1, strs[i].length() - 2);
tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr);
exprs.push_back(al, down_cast<expr_t>(tmp));
strs[i] = strs[i].substr(1, strs[i].length() - 2);
tmp = make_ConstantStr_t(al, l, LCompilers::s2c(al, strs[i]), nullptr);
exprs.push_back(al, down_cast<expr_t>(tmp));
} else {
tmp = make_Name_t(al, l,
LCompilers::s2c(al, strs[i]), expr_contextType::Load);
tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr);
exprs.push_back(al, down_cast<expr_t>(tmp));
tmp = make_Name_t(al, l,
LCompilers::s2c(al, strs[i]), expr_contextType::Load);
tmp = make_FormattedValue_t(al, l, EXPR(tmp), -1, nullptr);
exprs.push_back(al, down_cast<expr_t>(tmp));
}
}
tmp = make_JoinedStr_t(al, l, exprs.p, exprs.size());
} else if (strcmp(prefix, "b") == 0) {
LCompilers::Str s_;
s_.from_str(al, std::string(s));
std::string str = std::string(str_unescape_c(al, s_));
str = "b'" + str + "'";
tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr);
} else if ( strcmp(prefix, "br") == 0 || strcmp(prefix, "rb") == 0) {
std::string str = std::string(s);
str = "b'" + str + "'";
tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr);
} else if (strcmp(prefix, "r") == 0 ) {
tmp = make_ConstantStr_t(al, l, s, nullptr);
} else if (strcmp(prefix, "u") == 0 ) {
tmp = make_ConstantStr_t(al, l, s, LCompilers::s2c(al, "u"));
} else {
throw LCompilers::LCompilersException("The string is not recognized by the parser.");
}
return tmp;

return make_JoinedStr_t(al, l, exprs.p, exprs.size());
}
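
FMT_STRING above splits the f-string body at braces into literal chunks (ConstantStr nodes) and identifier chunks (FormattedValue nodes), then joins them in a JoinedStr. The stand-alone sketch below shows just the splitting step on made-up input; like the simplified loop above, it gives no special treatment to format specs or doubled braces.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    // Hypothetical f-string body.
    std::string body = "x = {x}, total = {total}!";

    std::vector<std::pair<bool, std::string>> parts;  // {is_expression, text}
    std::string cur;
    for (char c : body) {
        if (c == '{') {
            if (!cur.empty()) parts.push_back({false, cur});  // literal chunk
            cur.clear();
        } else if (c == '}') {
            if (!cur.empty()) parts.push_back({true, cur});   // expression chunk
            cur.clear();
        } else {
            cur.push_back(c);
        }
    }
    if (!cur.empty()) parts.push_back({false, cur});          // trailing literal

    for (const auto &p : parts)
        std::cout << (p.first ? "FormattedValue(" : "ConstantStr(")
                  << p.second << ")\n";
}
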

static inline keyword_t *CALL_ARG_KW(Allocator &al, Location &l,
src/lpython/parser/tokenizer.h (116 changes: 114 additions & 2 deletions)
@@ -3,6 +3,7 @@

#include <libasr/exception.h>
#include <libasr/alloc.h>
#include <libasr/string_utils.h>
#include <lpython/parser/parser_stype.h>

#define MAX_PAREN_LEVEL 200
@@ -54,18 +55,129 @@ class Tokenizer
}

// Return the current token as YYSTYPE::Str, strips first and last character
void token_str(Str &s) const
void token_str(Allocator &al, Str &s) const
{
s.p = (char*) tok + 1;
s.n = cur-tok-2;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips the first 3 and the last
// 3 characters
void token_str3(Str &s) const
void token_str3(Allocator &al, Str &s) const
{
s.p = (char*) tok + 3;
s.n = cur-tok-6;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}
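
This is the heart of "unescape at tokenizer level": the token value is converted once, here, instead of later in the parser actions. Below is a minimal stand-in for str_unescape_c, which actually lives in libasr/string_utils.h; its full escape coverage is assumed, and only a handful of common escapes are shown.

#include <iostream>
#include <string>

// Simplified stand-in for str_unescape_c (assumption: the real routine
// handles the full escape set; only \n, \t, \\, \' and \" are shown here).
static std::string unescape(const std::string &in) {
    std::string out;
    for (size_t i = 0; i < in.size(); i++) {
        if (in[i] == '\\' && i + 1 < in.size()) {
            char n = in[i + 1];
            if (n == 'n')  { out.push_back('\n'); i++; continue; }
            if (n == 't')  { out.push_back('\t'); i++; continue; }
            if (n == '\\') { out.push_back('\\'); i++; continue; }
            if (n == '\'') { out.push_back('\''); i++; continue; }
            if (n == '"')  { out.push_back('"');  i++; continue; }
        }
        out.push_back(in[i]);
    }
    return out;
}

int main() {
    std::cout << unescape("a\\tb\\nc") << std::endl;  // the tab and newline become real characters
}
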

// Return the current token as YYSTYPE::Str, strips first 2 characters and last character
void token_raw_str(Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
}

// Return the current token as YYSTYPE::Str, strips the first 4 and the last
// 3 characters
void token_raw_str3(Str &s) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
}

// Return the current token as YYSTYPE::Str, strips first 2 and last character
void token_unicode_str(Allocator &al, Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
void token_unicode_str3(Allocator &al, Str &s) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips first 2 and last character
void token_fmt_str(Allocator &al, Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
void token_fmt_str3(Allocator &al, Str &s) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips first 3 and last character
void token_raw_fmt_str(Str &s) const
{
s.p = (char*) tok + 3;
s.n = cur-tok-4;
}

// Return the current token as YYSTYPE::Str, strips the first 5 and last 3 chars
void token_raw_fmt_str3(Str &s) const
{
s.p = (char*) tok + 5;
s.n = cur-tok-8;
}

// Return the current token as YYSTYPE::Str, replaces `"` with `'` and prepends 'b'
void token_bytes(Allocator &al, Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
std::string s_ = str_unescape_c(al, s);
s_ = "b'" + s_ + "'";
s.p = s2c(al, s_);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, replaces `"""` or `'''` with `'` and prepends 'b'
void token_bytes3(Allocator &al, Str &s) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
std::string s_ = str_unescape_c(al, s);
s_ = "b'" + s_ + "'";
s.p = s2c(al, s_);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, transforms the string to b'string'
void token_raw_bytes(Str &s) const
{
s.p = (char*) tok + 1;
s.n = cur-tok-1;
s.p[0] = 'b';
s.p[1] = '\'';
s.p[s.n - 1] = '\'';
}

// Return the current token as YYSTYPE::Str, transforms the string to b'string'
void token_raw_bytes3(Str &s) const
{
s.p = (char*) tok + 3;
s.n = cur-tok-5;
s.p[0] = 'b';
s.p[1] = '\'';
s.p[s.n - 1] = '\'';
}
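
The offsets in these helpers follow directly from the token shape: prefix length in front, quote length on each side, and cur - tok as the total token length (the bytes helpers keep a couple of extra characters so they can rewrite the buffer into b'...' in place, hence their slightly different numbers). The stand-alone sketch below works the arithmetic for a made-up token; tok and cur here are local variables, not the tokenizer's members.

#include <cstring>
#include <iostream>
#include <string>

int main() {
    // Hypothetical token text as the tokenizer would see it between tok and cur.
    const char *tok = "rb'''abc'''";
    const char *cur = tok + std::strlen(tok);       // one past the last character

    size_t prefix = 2;                              // "rb"
    size_t quote  = 3;                              // "'''"
    const char *start = tok + prefix + quote;       // cf. tok + 5 in token_raw_fmt_str3
    size_t len = (cur - tok) - prefix - 2 * quote;  // cf. cur - tok - 8

    std::cout << std::string(start, len) << std::endl;  // prints: abc
}
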

// Return the current token's location
src/lpython/parser/tokenizer.re (52 changes: 47 additions & 5 deletions)
@@ -290,6 +290,18 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
| ("''" | "''" "\\"+) [^'\x00\\]
| [^'\x00\\] )*
"'''";
raw_str1 = 'r' (string1 | string2);
raw_str2 = 'r' (string3 | string4);
unicode_str1 = 'u' (string1 | string2);
unicode_str2 = 'u' (string3 | string4);
fmt_str1 = 'f' (string1 | string2);
fmt_str2 = 'f' (string3 | string4);
raw_fmt_str1 = ('rf' | 'fr') (string1 | string2);
raw_fmt_str2 = ('rf' | 'fr') (string3 | string4);
bytes1 = 'b' (string1 | string2);
bytes2 = 'b' (string3 | string4);
raw_bytes1 = ('rb' | 'br') (string1 | string2);
raw_bytes2 = ('rb' | 'br') (string3 | string4);
type_ignore = "#" whitespace? "type:" whitespace? "ignore" [^\n\x00]*;
type_comment = "#" whitespace? "type:" whitespace? [^\n\x00]*;
comment = "#" [^\n\x00]*;
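
The prefix definitions above pair each literal prefix with the existing string1..string4 quote forms. The stand-alone sketch below spells out the resulting prefix-to-token mapping (illustration only); the prefix is lowercased first, on the assumption that the single-quoted re2c literals accept either case, as the old PREFIX_STRING did.

#include <cctype>
#include <iostream>
#include <string>

// Illustration only: which token kind the rules above emit for a given
// literal prefix. Prefix spellings and token names are taken from the diff.
static const char *token_for_prefix(std::string p) {
    for (char &c : p) c = (char) std::tolower((unsigned char) c);
    if (p.empty())              return "TK_STRING";
    if (p == "r")               return "TK_RAW_STRING";
    if (p == "u")               return "TK_UNI_STRING";
    if (p == "f")               return "TK_FMT_STRING";
    if (p == "rf" || p == "fr") return "TK_RAW_FMT_STRING";
    if (p == "b")               return "TK_BYTES";
    if (p == "rb" || p == "br") return "TK_RAW_BYTES";
    return "(not a recognized string prefix)";
}

int main() {
    for (const char *p : {"", "r", "U", "f", "Rb", "xyz"})
        std::cout << "'" << p << "' -> " << token_for_prefix(p) << "\n";
}
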
@@ -583,10 +595,28 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
}
//docstring { RET(TK_DOCSTRING) }
string1 { token_str(yylval.string); RET(TK_STRING) }
string2 { token_str(yylval.string); RET(TK_STRING) }
string3 { token_str3(yylval.string); RET(TK_STRING) }
string4 { token_str3(yylval.string); RET(TK_STRING) }
string1 { token_str(al, yylval.string); RET(TK_STRING) }
string2 { token_str(al, yylval.string); RET(TK_STRING) }
string3 { token_str3(al, yylval.string); RET(TK_STRING) }
string4 { token_str3(al, yylval.string); RET(TK_STRING) }
raw_str1 { token_raw_str(yylval.string); RET(TK_RAW_STRING) }
raw_str2 { token_raw_str3(yylval.string); RET(TK_RAW_STRING) }
unicode_str1 { token_unicode_str(al, yylval.string); RET(TK_UNI_STRING) }
unicode_str2 { token_unicode_str3(al, yylval.string); RET(TK_UNI_STRING) }
fmt_str1 { token_fmt_str(al, yylval.string); RET(TK_FMT_STRING) }
fmt_str2 { token_fmt_str3(al, yylval.string); RET(TK_FMT_STRING) }
raw_fmt_str1 { token_raw_fmt_str(yylval.string); RET(TK_RAW_FMT_STRING) }
raw_fmt_str2 { token_raw_fmt_str3(yylval.string); RET(TK_RAW_FMT_STRING) }
bytes1 { token_bytes(al, yylval.string); RET(TK_BYTES) }
bytes2 { token_bytes3(al, yylval.string); RET(TK_BYTES) }
raw_bytes1 { token_raw_bytes(yylval.string); RET(TK_RAW_BYTES) }
raw_bytes2 { token_raw_bytes3(yylval.string); RET(TK_RAW_BYTES) }
name { token(yylval.string); RET(TK_NAME) }
*/
@@ -687,6 +717,12 @@ std::string token2text(const int token)
T(TK_AT, "@")

T(TK_STRING, "string")
T(TK_RAW_STRING, "raw_str")
T(TK_UNI_STRING, "unicode_str")
T(TK_FMT_STRING, "fmt_string")
T(TK_RAW_FMT_STRING, "raw_fmt_string")
T(TK_BYTES, "bytes")
T(TK_RAW_BYTES, "raw_bytes")
T(TK_COMMENT, "comment")
T(TK_EOLCOMMENT, "eolcomment")
T(TK_TYPE_COMMENT, "type_comment")
@@ -824,7 +860,13 @@ std::string pickle_token(int token, const YYSTYPE &yystype)
t += " " + std::to_string(yystype.f);
} else if (token == yytokentype::TK_IMAG_NUM) {
t += " " + std::to_string(yystype.f) + "j";
} else if (token == yytokentype::TK_STRING) {
} else if (token == yytokentype::TK_STRING
|| token == yytokentype::TK_RAW_STRING
|| token == yytokentype::TK_UNI_STRING
|| token == yytokentype::TK_FMT_STRING
|| token == yytokentype::TK_RAW_FMT_STRING
|| token == yytokentype::TK_BYTES
|| token == yytokentype::TK_RAW_BYTES) {
t = t + " " + "\"" + str_escape_c(yystype.string.str()) + "\"";
} else if (token == yytokentype::TK_TYPE_COMMENT) {
t = t + " " + "\"" + yystype.string.str() + "\"";
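
Because token values are now stored unescaped, pickle_token re-escapes them with str_escape_c before printing so the pickled output stays readable. Below is a minimal stand-in for that direction; the real str_escape_c presumably covers more cases than the few shown here.

#include <iostream>
#include <string>

// Simplified stand-in for str_escape_c: turn control characters back into
// visible escape sequences for display.
static std::string escape_for_display(const std::string &in) {
    std::string out;
    for (char c : in) {
        if (c == '\n')      out += "\\n";
        else if (c == '\t') out += "\\t";
        else if (c == '\\') out += "\\\\";
        else if (c == '"')  out += "\\\"";
        else                out.push_back(c);
    }
    return out;
}

int main() {
    std::cout << '"' << escape_for_display("a\tb\nc") << '"' << std::endl;  // prints "a\tb\nc"
}
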
