Skip to content

Commit dfd584b

Browse files
committed
Support lexing multibyte unicode chars (poorly, but it works)
1 parent c91cfa0 commit dfd584b

File tree

4 files changed

+17
-51
lines changed

4 files changed

+17
-51
lines changed

lexer.l

+14-4
Original file line number | Diff line number | Diff line change
@@ -28,13 +28,22 @@ static int saw_non_hash;
2828
%x doc_block
2929
%x suffix
3030

31+
ident [a-zA-Z\x80-\xff_][a-zA-Z0-9\x80-\xff_]*
32+
3133
%%
3234

33-
<suffix>[a-zA-Z_][a-zA-Z0-9_]* { BEGIN(INITIAL); }
34-
<suffix>(.|\n) { yyless(0); BEGIN(INITIAL); }
35+
<suffix>{ident} { BEGIN(INITIAL); }
36+
<suffix>(.|\n) { yyless(0); BEGIN(INITIAL); }
3537

3638
[ \n\t\r] { }
3739

40+
\xef\xbb\xbf {
41+
// UTF-8 byte order mark (BOM), ignore if in line 1, error otherwise
42+
if (yyget_lineno() != 1) {
43+
return -1;
44+
}
45+
}
46+
3847
\/\/(\/|\!) { BEGIN(doc_line); yymore(); }
3948
<doc_line>\n { BEGIN(INITIAL);
4049
yyleng--;
@@ -104,7 +113,7 @@ use { return USE; }
104113
where { return WHERE; }
105114
while { return WHILE; }
106115

107-
[a-zA-Z_][a-zA-Z0-9_]* { return IDENT; }
116+
{ident} { return IDENT; }
108117

109118
0x[0-9a-fA-F_]+ { BEGIN(suffix); return LIT_INTEGER; }
110119
0o[0-8_]+ { BEGIN(suffix); return LIT_INTEGER; }
@@ -172,11 +181,12 @@ while { return WHILE; }
172181

173182
\x27 { BEGIN(ltorchar); yymore(); }
174183
<ltorchar>static { BEGIN(INITIAL); return STATIC_LIFETIME; }
175-
<ltorchar>[a-zA-Z_][a-zA-Z0-9_]* { BEGIN(INITIAL); return LIFETIME; }
184+
<ltorchar>{ident} { BEGIN(INITIAL); return LIFETIME; }
176185
<ltorchar>\\[nrt\\\x27\x220]\x27 { BEGIN(suffix); return LIT_CHAR; }
177186
<ltorchar>\\x[0-9a-fA-F]{2}\x27 { BEGIN(suffix); return LIT_CHAR; }
178187
<ltorchar>\\u\{[0-9a-fA-F]?{6}\}\x27 { BEGIN(suffix); return LIT_CHAR; }
179188
<ltorchar>.\x27 { BEGIN(suffix); return LIT_CHAR; }
189+
<ltorchar>[\x80-\xff]{2,4}\x27 { BEGIN(suffix); return LIT_CHAR; }
180190
<ltorchar><<EOF>> { BEGIN(INITIAL); return -1; }
181191

182192
b\x22 { BEGIN(bytestr); yymore(); }

rlex.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ fn token_to_string(tok: token::Token) -> String {
3535
format!("Float({})", c.as_str().to_string())
3636
},
3737
Lit::Str_(s) => {
38-
format!("Str(\"{}\")", token::get_name(s).get().escape_default())
38+
format!("Str(\"{}\")", token::get_name(s).get())
3939
},
4040
Lit::StrRaw(s, n) => {
4141
format!("StrRaw(r{delim}\"{string}\"{delim})",

tokens.c

+1-43
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@ extern char *yytext;
99

1010
static char *binop_text(char*);
1111
static char *desugar_num(char*, char*);
12-
static char *escape_string(char*);
1312

1413
void print_token(int token) {
1514
switch (token) {
@@ -58,7 +57,7 @@ void print_token(int token) {
5857
case LIT_CHAR: printf("Char(%s)", yytext); break;
5958
case LIT_INTEGER: printf("Integer(%s)", yytext); break;
6059
case LIT_FLOAT: printf("Float(%s)", yytext); break;
61-
case LIT_STR: printf("Str(%s)", escape_string(yytext)); break;
60+
case LIT_STR: printf("Str(%s)", yytext); break;
6261
case LIT_STR_RAW: printf("StrRaw(%s)", yytext); break;
6362
case LIT_BINARY: printf("Binary(%s)", yytext); break;
6463
case LIT_BINARY_RAW: printf("BinaryRaw(%s)", yytext); break;
@@ -225,44 +224,3 @@ static char *desugar_num(char *tok, char *default_suffix) {
225224
}
226225
return res;
227226
}
228-
229-
static char *escape_string(char *str) {
230-
int len = strlen(str);
231-
char *res = malloc(sizeof(char) * len * 2 + 1);
232-
int j = 0;
233-
res[j++] = '"';
234-
for (int i = 1; i < len - 1; ++i) {
235-
char c = str[i];
236-
switch (c) {
237-
case '\t':
238-
res[j++] = '\\';
239-
res[j++] = 't';
240-
break;
241-
case '\r':
242-
res[j++] = '\\';
243-
res[j++] = 'r';
244-
break;
245-
case '\n':
246-
res[j++] = '\\';
247-
res[j++] = 'n';
248-
break;
249-
case '\\':
250-
res[j++] = '\\';
251-
res[j++] = '\\';
252-
break;
253-
case '\'':
254-
res[j++] = '\\';
255-
res[j++] = '\'';
256-
break;
257-
case '\"':
258-
res[j++] = '\\';
259-
res[j++] = '\"';
260-
break;
261-
default:
262-
res[j++] = c;
263-
}
264-
}
265-
res[j++] = '"';
266-
res[j++] = '\0';
267-
return res;
268-
}

verify-lexer.py

+1-3
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,7 @@ def compare(p):
3131
for f in filter(lambda p: p.endswith('.rs'), files):
3232
p = os.path.join(base, f)
3333
# compile-fail programs should be ignored
34-
# also, the lexer doesn't work with multibyte characters so
35-
# ignore programs that contain them
36-
if "compile-fail" in p or not all(ord(c) < 128 for c in open(p).read()):
34+
if "compile-fail" in p:
3735
print("skipping {}".format(p))
3836
continue
3937
print("comparing {}".format(p))

0 commit comments

Comments
 (0)