Skip to content

Commit

Permalink
Refactor: Merge/Combine token_str() and related funcs
Browse files Browse the repository at this point in the history
  • Loading branch information
Shaikh-Ubaid committed Jun 13, 2023
1 parent d2a2f11 commit 20d1db9
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 125 deletions.
8 changes: 6 additions & 2 deletions src/libasr/string_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ std::string str_escape_c(const std::string &s) {
return o.str();
}

char* str_unescape_c(Allocator &al, LCompilers::Str &s) {
std::string str_unescape_c0(LCompilers::Str &s) {
std::string x = "";
size_t idx = 0;
for (; idx + 1 < s.size(); idx++) {
Expand Down Expand Up @@ -191,7 +191,11 @@ char* str_unescape_c(Allocator &al, LCompilers::Str &s) {
if (idx < s.size()) {
x += s[idx];
}
return LCompilers::s2c(al, x);
return x;
}

// Unescape the C-style escape sequences in `s` via str_unescape_c0() and
// hand the resulting text to LCompilers::s2c() together with `al`, returning
// whatever char* s2c() produces for it.
char* str_unescape_c(Allocator &al, LCompilers::Str &s) {
    const std::string unescaped = str_unescape_c0(s);
    return LCompilers::s2c(al, unescaped);
}

std::string str_escape_fortran_double_quote(const std::string &s) {
Expand Down
1 change: 1 addition & 0 deletions src/libasr/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ std::string join_paths(const std::vector<std::string> &paths);
// Escapes special characters from the given string
// using C style escaping
std::string str_escape_c(const std::string &s);
std::string str_unescape_c0(LCompilers::Str &s);
char* str_unescape_c(Allocator &al, LCompilers::Str &s);

// Escapes double quote characters from the given string
Expand Down
128 changes: 21 additions & 107 deletions src/lpython/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,127 +54,41 @@ class Tokenizer
s.n = cur-tok;
}

// Return the current token as YYSTYPE::Str, strips first and last character
void token_str(Allocator &al, Str &s) const
// Return the current token as YYSTYPE::Str, strip the string appropriately
// based on the quotes it uses and unescape the string
void token_str(Allocator &al, Str &s, int quote_len, int prefix_len) const
{
s.p = (char*) tok + 1;
s.n = cur-tok-2;
s.p = (char*) tok + (prefix_len + quote_len);
s.n = cur-tok-(prefix_len + quote_len + quote_len);
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips the first 3 and the last
// 3 characters
void token_str3(Allocator &al, Str &s) const
// Return the current token as YYSTYPE::Str, strip the string appropriately
// based on the quotes it uses. It does not unescape the string
void token_raw_str(Str &s, int quote_len, int prefix_len) const
{
s.p = (char*) tok + 3;
s.n = cur-tok-6;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips first 2 characters and last character
void token_raw_str(Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
}

// Return the current token as YYSTYPE::Str, strips the first 4 and the last
// 3 characters
void token_raw_str3(Str &s) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
}

// Return the current token as YYSTYPE::Str, strips first 2 and last character
void token_unicode_str(Allocator &al, Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
void token_unicode_str3(Allocator &al, Str &s) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips first 2 and last character
void token_fmt_str(Allocator &al, Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
s.p = (char*) tok + (prefix_len + quote_len);
s.n = cur-tok-(prefix_len + quote_len + quote_len);
}

// Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
void token_fmt_str3(Allocator &al, Str &s) const
// Return the current token as YYSTYPE::Str, strip the string appropriately,
// unescape the string and prepend 'b'
void token_bytes(Allocator &al, Str &s, int quote_len, int prefix_len) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
s.p = str_unescape_c(al, s);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, strips first 3 and last character
void token_raw_fmt_str(Str &s) const
{
s.p = (char*) tok + 3;
s.n = cur-tok-4;
}

// Return the current token as YYSTYPE::Str, strips the first 5 and last 3 chars
void token_raw_fmt_str3(Str &s) const
{
s.p = (char*) tok + 5;
s.n = cur-tok-8;
}

// Return the current token as YYSTYPE::Str, replaces `"` with `'` and prepends 'b'
void token_bytes(Allocator &al, Str &s) const
{
s.p = (char*) tok + 2;
s.n = cur-tok-3;
std::string s_ = str_unescape_c(al, s);
s_ = "b'" + s_ + "'";
s.p = (char*) tok + (prefix_len + quote_len);
s.n = cur-tok-(prefix_len + quote_len + quote_len);
std::string s_ = "b'" + str_unescape_c0(s) + "'";
s.p = s2c(al, s_);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, replaces `"""` or `'''` with `'` and prepends 'b'
void token_bytes3(Allocator &al, Str &s) const
{
s.p = (char*) tok + 4;
s.n = cur-tok-7;
std::string s_ = str_unescape_c(al, s);
s_ = "b'" + s_ + "'";
s.p = s2c(al, s_);
s.n = strlen(s.p);
}

// Return the current token as YYSTYPE::Str, transforms the string to b'string'
void token_raw_bytes(Str &s) const
{
s.p = (char*) tok + 1;
s.n = cur-tok-1;
s.p[0] = 'b';
s.p[1] = '\'';
s.p[s.n - 1] = '\'';
}

// Return the current token as YYSTYPE::Str, transforms the string to b'string'
void token_raw_bytes3(Str &s) const
// Return the current token as YYSTYPE::Str, strip the string appropriately
// and prepend 'b'. It does not unescape the string.
void token_raw_bytes(Str &s, int quote_len, int prefix_len) const
{
s.p = (char*) tok + 3;
s.n = cur-tok-5;
s.p = (char*) tok + (prefix_len + quote_len - 2);
s.n = cur-tok-(prefix_len + quote_len + quote_len - 3);
s.p[0] = 'b';
s.p[1] = '\'';
s.p[s.n - 1] = '\'';
Expand Down
32 changes: 16 additions & 16 deletions src/lpython/parser/tokenizer.re
Original file line number Diff line number Diff line change
Expand Up @@ -595,28 +595,28 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
}
//docstring { RET(TK_DOCSTRING) }
string1 { token_str(al, yylval.string); RET(TK_STRING) }
string2 { token_str(al, yylval.string); RET(TK_STRING) }
string3 { token_str3(al, yylval.string); RET(TK_STRING) }
string4 { token_str3(al, yylval.string); RET(TK_STRING) }
string1 { token_str(al, yylval.string, 1, 0); RET(TK_STRING) }
string2 { token_str(al, yylval.string, 1, 0); RET(TK_STRING) }
string3 { token_str(al, yylval.string, 3, 0); RET(TK_STRING) }
string4 { token_str(al, yylval.string, 3, 0); RET(TK_STRING) }
raw_str1 { token_raw_str(yylval.string); RET(TK_RAW_STRING) }
raw_str2 { token_raw_str3(yylval.string); RET(TK_RAW_STRING) }
raw_str1 { token_raw_str(yylval.string, 1, 1); RET(TK_RAW_STRING) }
raw_str2 { token_raw_str(yylval.string, 3, 1); RET(TK_RAW_STRING) }
unicode_str1 { token_unicode_str(al, yylval.string); RET(TK_UNI_STRING) }
unicode_str2 { token_unicode_str3(al, yylval.string); RET(TK_UNI_STRING) }
unicode_str1 { token_str(al, yylval.string, 1, 1); RET(TK_UNI_STRING) }
unicode_str2 { token_str(al, yylval.string, 3, 1); RET(TK_UNI_STRING) }
fmt_str1 { token_fmt_str(al, yylval.string); RET(TK_FMT_STRING) }
fmt_str2 { token_fmt_str3(al, yylval.string); RET(TK_FMT_STRING) }
fmt_str1 { token_str(al, yylval.string, 1, 1); RET(TK_FMT_STRING) }
fmt_str2 { token_str(al, yylval.string, 3, 1); RET(TK_FMT_STRING) }
raw_fmt_str1 { token_raw_fmt_str(yylval.string); RET(TK_RAW_FMT_STRING) }
raw_fmt_str2 { token_raw_fmt_str3(yylval.string); RET(TK_RAW_FMT_STRING) }
raw_fmt_str1 { token_raw_str(yylval.string, 1, 2); RET(TK_RAW_FMT_STRING) }
raw_fmt_str2 { token_raw_str(yylval.string, 3, 2); RET(TK_RAW_FMT_STRING) }
bytes1 { token_bytes(al, yylval.string); RET(TK_BYTES) }
bytes2 { token_bytes3(al, yylval.string); RET(TK_BYTES) }
bytes1 { token_bytes(al, yylval.string, 1, 1); RET(TK_BYTES) }
bytes2 { token_bytes(al, yylval.string, 3, 1); RET(TK_BYTES) }
raw_bytes1 { token_raw_bytes(yylval.string); RET(TK_RAW_BYTES) }
raw_bytes2 { token_raw_bytes3(yylval.string); RET(TK_RAW_BYTES) }
raw_bytes1 { token_raw_bytes(yylval.string, 1, 2); RET(TK_RAW_BYTES) }
raw_bytes2 { token_raw_bytes(yylval.string, 3, 2); RET(TK_RAW_BYTES) }
name { token(yylval.string); RET(TK_NAME) }
*/
Expand Down

0 comments on commit 20d1db9

Please sign in to comment.