Refactor: Merge/Combine token_str() and related funcs

lcompilers · Jun 13, 2023 · 20d1db9 · 20d1db9
1 parent d2a2f11
commit 20d1db9
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 125 deletions.
diff --git a/src/libasr/string_utils.cpp b/src/libasr/string_utils.cpp
@@ -154,7 +154,7 @@ std::string str_escape_c(const std::string &s) {
  return o.str();
 }
 
-char* str_unescape_c(Allocator &al, LCompilers::Str &s) {
+std::string str_unescape_c0(LCompilers::Str &s) {
  std::string x = "";
  size_t idx = 0;
  for (; idx + 1 < s.size(); idx++) {
@@ -191,7 +191,11 @@ char* str_unescape_c(Allocator &al, LCompilers::Str &s) {
  if (idx < s.size()) {
  x += s[idx];
  }
- return LCompilers::s2c(al, x);
+ return x;
+}
+
+char* str_unescape_c(Allocator &al, LCompilers::Str &s) {
+ return LCompilers::s2c(al, str_unescape_c0(s));
 }
 
 std::string str_escape_fortran_double_quote(const std::string &s) {

diff --git a/src/libasr/string_utils.h b/src/libasr/string_utils.h
@@ -37,6 +37,7 @@ std::string join_paths(const std::vector<std::string> &paths);
 // Escapes special characters from the given string
 // using C style escaping
 std::string str_escape_c(const std::string &s);
+std::string str_unescape_c0(LCompilers::Str &s);
 char* str_unescape_c(Allocator &al, LCompilers::Str &s);
 
 // Escapes double quote characters from the given string

diff --git a/src/lpython/parser/tokenizer.h b/src/lpython/parser/tokenizer.h
@@ -54,127 +54,41 @@ class Tokenizer
  s.n = cur-tok;
  }
 
- // Return the current token as YYSTYPE::Str, strips first and last character
- void token_str(Allocator &al, Str &s) const
+ // Return the current token as YYSTYPE::Str, strip the string appropriately
+ // based on the quotes it uses and unescape the string
+ void token_str(Allocator &al, Str &s, int quote_len, int prefix_len) const
  {
- s.p = (char*) tok + 1;
- s.n = cur-tok-2;
+ s.p = (char*) tok + (prefix_len + quote_len);
+ s.n = cur-tok-(prefix_len + quote_len + quote_len);
  s.p = str_unescape_c(al, s);
  s.n = strlen(s.p);
  }
 
- // Return the current token as YYSTYPE::Str, strips the first 3 and the last
- // 3 characters
- void token_str3(Allocator &al, Str &s) const
+ // Return the current token as YYSTYPE::Str, strip the string appropriately
+ // based on the quotes it uses. It does not unescape the string
+ void token_raw_str(Str &s, int quote_len, int prefix_len) const
  {
- s.p = (char*) tok + 3;
- s.n = cur-tok-6;
- s.p = str_unescape_c(al, s);
- s.n = strlen(s.p);
- }
-
- // Return the current token as YYSTYPE::Str, strips first 2 characters and last character
- void token_raw_str(Str &s) const
- {
- s.p = (char*) tok + 2;
- s.n = cur-tok-3;
- }
-
- // Return the current token as YYSTYPE::Str, strips the first 4 and the last
- // 3 characters
- void token_raw_str3(Str &s) const
- {
- s.p = (char*) tok + 4;
- s.n = cur-tok-7;
- }
-
- // Return the current token as YYSTYPE::Str, strips first 2 and last character
- void token_unicode_str(Allocator &al, Str &s) const
- {
- s.p = (char*) tok + 2;
- s.n = cur-tok-3;
- s.p = str_unescape_c(al, s);
- s.n = strlen(s.p);
- }
-
- // Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
- void token_unicode_str3(Allocator &al, Str &s) const
- {
- s.p = (char*) tok + 4;
- s.n = cur-tok-7;
- s.p = str_unescape_c(al, s);
- s.n = strlen(s.p);
- }
-
- // Return the current token as YYSTYPE::Str, strips first 2 and last character
- void token_fmt_str(Allocator &al, Str &s) const
- {
- s.p = (char*) tok + 2;
- s.n = cur-tok-3;
- s.p = str_unescape_c(al, s);
- s.n = strlen(s.p);
+ s.p = (char*) tok + (prefix_len + quote_len);
+ s.n = cur-tok-(prefix_len + quote_len + quote_len);
  }
 
- // Return the current token as YYSTYPE::Str, strips the first 4 and the last 3 chars
- void token_fmt_str3(Allocator &al, Str &s) const
+ // Return the current token as YYSTYPE::Str, strip the string appropriately,
+ // unescape the string and wrap the result as b'...'
+ void token_bytes(Allocator &al, Str &s, int quote_len, int prefix_len) const
  {
- s.p = (char*) tok + 4;
- s.n = cur-tok-7;
- s.p = str_unescape_c(al, s);
- s.n = strlen(s.p);
- }
-
- // Return the current token as YYSTYPE::Str, strips first 3 and last character
- void token_raw_fmt_str(Str &s) const
- {
- s.p = (char*) tok + 3;
- s.n = cur-tok-4;
- }
-
- // Return the current token as YYSTYPE::Str, strips the first 5 and last 3 chars
- void token_raw_fmt_str3(Str &s) const
- {
- s.p = (char*) tok + 5;
- s.n = cur-tok-8;
- }
-
- // Return the current token as YYSTYPE::Str, replaces `"` with `'` and prepends 'b'
- void token_bytes(Allocator &al, Str &s) const
- {
- s.p = (char*) tok + 2;
- s.n = cur-tok-3;
- std::string s_ = str_unescape_c(al, s);
- s_ = "b'" + s_ + "'";
+ s.p = (char*) tok + (prefix_len + quote_len);
+ s.n = cur-tok-(prefix_len + quote_len + quote_len);
+ std::string s_ = "b'" + str_unescape_c0(s) + "'";
  s.p = s2c(al, s_);
  s.n = strlen(s.p);
  }
 
- // Return the current token as YYSTYPE::Str, replaces `"""` or `'''` with `'` and prepends 'b'
- void token_bytes3(Allocator &al, Str &s) const
- {
- s.p = (char*) tok + 4;
- s.n = cur-tok-7;
- std::string s_ = str_unescape_c(al, s);
- s_ = "b'" + s_ + "'";
- s.p = s2c(al, s_);
- s.n = strlen(s.p);
- }
-
- // Return the current token as YYSTYPE::Str, transforms the string to b'string'
- void token_raw_bytes(Str &s) const
- {
- s.p = (char*) tok + 1;
- s.n = cur-tok-1;
- s.p[0] = 'b';
- s.p[1] = '\'';
- s.p[s.n - 1] = '\'';
- }
-
- // Return the current token as YYSTYPE::Str, transforms the string to b'string'
- void token_raw_bytes3(Str &s) const
+ // Return the current token as YYSTYPE::Str, strip the string appropriately
+ // and prepend 'b'. It does not unescape the string.
+ void token_raw_bytes(Str &s, int quote_len, int prefix_len) const
  {
- s.p = (char*) tok + 3;
- s.n = cur-tok-5;
+ s.p = (char*) tok + (prefix_len + quote_len - 2);
+ s.n = cur-tok-(prefix_len + quote_len + quote_len - 3);
  s.p[0] = 'b';
  s.p[1] = '\'';
  s.p[s.n - 1] = '\'';

diff --git a/src/lpython/parser/tokenizer.re b/src/lpython/parser/tokenizer.re
@@ -595,28 +595,28 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
  }
  //docstring { RET(TK_DOCSTRING) }
 
- string1 { token_str(al, yylval.string); RET(TK_STRING) }
- string2 { token_str(al, yylval.string); RET(TK_STRING) }
- string3 { token_str3(al, yylval.string); RET(TK_STRING) }
- string4 { token_str3(al, yylval.string); RET(TK_STRING) }
+ string1 { token_str(al, yylval.string, 1, 0); RET(TK_STRING) }
+ string2 { token_str(al, yylval.string, 1, 0); RET(TK_STRING) }
+ string3 { token_str(al, yylval.string, 3, 0); RET(TK_STRING) }
+ string4 { token_str(al, yylval.string, 3, 0); RET(TK_STRING) }
 
- raw_str1 { token_raw_str(yylval.string); RET(TK_RAW_STRING) }
- raw_str2 { token_raw_str3(yylval.string); RET(TK_RAW_STRING) }
+ raw_str1 { token_raw_str(yylval.string, 1, 1); RET(TK_RAW_STRING) }
+ raw_str2 { token_raw_str(yylval.string, 3, 1); RET(TK_RAW_STRING) }
 
- unicode_str1 { token_unicode_str(al, yylval.string); RET(TK_UNI_STRING) }
- unicode_str2 { token_unicode_str3(al, yylval.string); RET(TK_UNI_STRING) }
+ unicode_str1 { token_str(al, yylval.string, 1, 1); RET(TK_UNI_STRING) }
+ unicode_str2 { token_str(al, yylval.string, 3, 1); RET(TK_UNI_STRING) }
 
- fmt_str1 { token_fmt_str(al, yylval.string); RET(TK_FMT_STRING) }
- fmt_str2 { token_fmt_str3(al, yylval.string); RET(TK_FMT_STRING) }
+ fmt_str1 { token_str(al, yylval.string, 1, 1); RET(TK_FMT_STRING) }
+ fmt_str2 { token_str(al, yylval.string, 3, 1); RET(TK_FMT_STRING) }
 
- raw_fmt_str1 { token_raw_fmt_str(yylval.string); RET(TK_RAW_FMT_STRING) }
- raw_fmt_str2 { token_raw_fmt_str3(yylval.string); RET(TK_RAW_FMT_STRING) }
+ raw_fmt_str1 { token_raw_str(yylval.string, 1, 2); RET(TK_RAW_FMT_STRING) }
+ raw_fmt_str2 { token_raw_str(yylval.string, 3, 2); RET(TK_RAW_FMT_STRING) }
 
- bytes1 { token_bytes(al, yylval.string); RET(TK_BYTES) }
- bytes2 { token_bytes3(al, yylval.string); RET(TK_BYTES) }
+ bytes1 { token_bytes(al, yylval.string, 1, 1); RET(TK_BYTES) }
+ bytes2 { token_bytes(al, yylval.string, 3, 1); RET(TK_BYTES) }
 
- raw_bytes1 { token_raw_bytes(yylval.string); RET(TK_RAW_BYTES) }
- raw_bytes2 { token_raw_bytes3(yylval.string); RET(TK_RAW_BYTES) }
+ raw_bytes1 { token_raw_bytes(yylval.string, 1, 2); RET(TK_RAW_BYTES) }
+ raw_bytes2 { token_raw_bytes(yylval.string, 3, 2); RET(TK_RAW_BYTES) }
 
  name { token(yylval.string); RET(TK_NAME) }
  */