Skip to content

Commit 7397bdc

Browse files
committed
Auto merge of #24620 - pczarn:model-lexer-issues, r=cmr
Fixes #15679 Fixes #15878 Fixes #15882 Closes #15883
2 parents 77acda1 + 13bc8af commit 7397bdc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+982
-179
lines changed

src/grammar/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ javac *.java
1212
rustc -O verify.rs
1313
for file in ../*/**.rs; do
1414
echo $file;
15-
grun RustLexer tokens -tokens < $file | ./verify $file RustLexer.tokens || break
15+
grun RustLexer tokens -tokens < "$file" | ./verify "$file" RustLexer.tokens || break
1616
done
1717
```
1818

src/grammar/RustLexer.g4

+49-71
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
lexer grammar RustLexer;
22

3+
@lexer::members {
4+
public boolean is_at(int pos) {
5+
return _input.index() == pos;
6+
}
7+
}
8+
9+
310
tokens {
411
EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
512
MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
@@ -8,14 +15,10 @@ tokens {
815
LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
916
LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
1017
LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
11-
COMMENT
18+
COMMENT, SHEBANG
1219
}
1320

14-
/* Note: due to antlr limitations, we can't represent XID_start and
15-
* XID_continue properly. ASCII-only substitute. */
16-
17-
fragment XID_start : [_a-zA-Z] ;
18-
fragment XID_continue : [_a-zA-Z0-9] ;
21+
import xidstart , xidcontinue;
1922

2023

2124
/* Expression-operator symbols */
@@ -90,94 +93,63 @@ fragment CHAR_ESCAPE
9093
| [xX] HEXIT HEXIT
9194
| 'u' HEXIT HEXIT HEXIT HEXIT
9295
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
96+
| 'u{' HEXIT '}'
97+
| 'u{' HEXIT HEXIT '}'
98+
| 'u{' HEXIT HEXIT HEXIT '}'
99+
| 'u{' HEXIT HEXIT HEXIT HEXIT '}'
100+
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT '}'
101+
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT '}'
93102
;
94103
95104
fragment SUFFIX
96105
: IDENT
97106
;
98107
108+
fragment INTEGER_SUFFIX
109+
: { _input.LA(1) != 'e' && _input.LA(1) != 'E' }? SUFFIX
110+
;
111+
99112
LIT_CHAR
100-
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\'' SUFFIX?
113+
: '\'' ( '\\' CHAR_ESCAPE
114+
| ~[\\'\n\t\r]
115+
| '\ud800' .. '\udbff' '\udc00' .. '\udfff'
116+
)
117+
'\'' SUFFIX?
101118
;
102119

103120
LIT_BYTE
104-
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\'' SUFFIX?
121+
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT
122+
| [nrt\\'"0] )
123+
| ~[\\'\n\t\r] '\udc00'..'\udfff'?
124+
)
125+
'\'' SUFFIX?
105126
;
106127

107128
LIT_INTEGER
108-
: [0-9][0-9_]* SUFFIX?
109-
| '0b' [01][01_]* SUFFIX?
110-
| '0o' [0-7][0-7_]* SUFFIX?
111-
| '0x' [0-9a-fA-F][0-9a-fA-F_]* SUFFIX?
129+
130+
: [0-9][0-9_]* INTEGER_SUFFIX?
131+
| '0b' [01_]+ INTEGER_SUFFIX?
132+
| '0o' [0-7_]+ INTEGER_SUFFIX?
133+
| '0x' [0-9a-fA-F_]+ INTEGER_SUFFIX?
112134
;
113135

114136
LIT_FLOAT
115137
: [0-9][0-9_]* ('.' {
116-
/* dot followed by another dot is a range, no float */
138+
/* dot followed by another dot is a range, not a float */
117139
_input.LA(1) != '.' &&
118-
/* dot followed by an identifier is an integer with a function call, no float */
140+
/* dot followed by an identifier is an integer with a function call, not a float */
119141
_input.LA(1) != '_' &&
120-
_input.LA(1) != 'a' &&
121-
_input.LA(1) != 'b' &&
122-
_input.LA(1) != 'c' &&
123-
_input.LA(1) != 'd' &&
124-
_input.LA(1) != 'e' &&
125-
_input.LA(1) != 'f' &&
126-
_input.LA(1) != 'g' &&
127-
_input.LA(1) != 'h' &&
128-
_input.LA(1) != 'i' &&
129-
_input.LA(1) != 'j' &&
130-
_input.LA(1) != 'k' &&
131-
_input.LA(1) != 'l' &&
132-
_input.LA(1) != 'm' &&
133-
_input.LA(1) != 'n' &&
134-
_input.LA(1) != 'o' &&
135-
_input.LA(1) != 'p' &&
136-
_input.LA(1) != 'q' &&
137-
_input.LA(1) != 'r' &&
138-
_input.LA(1) != 's' &&
139-
_input.LA(1) != 't' &&
140-
_input.LA(1) != 'u' &&
141-
_input.LA(1) != 'v' &&
142-
_input.LA(1) != 'w' &&
143-
_input.LA(1) != 'x' &&
144-
_input.LA(1) != 'y' &&
145-
_input.LA(1) != 'z' &&
146-
_input.LA(1) != 'A' &&
147-
_input.LA(1) != 'B' &&
148-
_input.LA(1) != 'C' &&
149-
_input.LA(1) != 'D' &&
150-
_input.LA(1) != 'E' &&
151-
_input.LA(1) != 'F' &&
152-
_input.LA(1) != 'G' &&
153-
_input.LA(1) != 'H' &&
154-
_input.LA(1) != 'I' &&
155-
_input.LA(1) != 'J' &&
156-
_input.LA(1) != 'K' &&
157-
_input.LA(1) != 'L' &&
158-
_input.LA(1) != 'M' &&
159-
_input.LA(1) != 'N' &&
160-
_input.LA(1) != 'O' &&
161-
_input.LA(1) != 'P' &&
162-
_input.LA(1) != 'Q' &&
163-
_input.LA(1) != 'R' &&
164-
_input.LA(1) != 'S' &&
165-
_input.LA(1) != 'T' &&
166-
_input.LA(1) != 'U' &&
167-
_input.LA(1) != 'V' &&
168-
_input.LA(1) != 'W' &&
169-
_input.LA(1) != 'X' &&
170-
_input.LA(1) != 'Y' &&
171-
_input.LA(1) != 'Z'
142+
!(_input.LA(1) >= 'a' && _input.LA(1) <= 'z') &&
143+
!(_input.LA(1) >= 'A' && _input.LA(1) <= 'Z')
172144
}? | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
173145
;
174146

175147
LIT_STR
176148
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
177149
;
178150

179-
LIT_BINARY : 'b' LIT_STR SUFFIX?;
180-
LIT_BINARY_RAW : 'rb' LIT_STR_RAW SUFFIX?;
151+
/* Byte-string literal: a `b` prefix immediately followed by an ordinary
 * string literal. No SUFFIX is added here because LIT_STR already ends
 * with an optional SUFFIX of its own. */
LIT_BINARY : 'b' LIT_STR ;
152+
/* Raw byte-string literal: a `b` prefix immediately followed by a raw string
 * literal.
 * NOTE(review): LIT_STR_RAW is defined outside this excerpt; presumably it
 * supplies the leading `r`, making the overall prefix `br` - confirm. */
LIT_BINARY_RAW : 'b' LIT_STR_RAW ;
181153

182154
/* this is a bit messy */
183155

@@ -197,21 +169,27 @@ LIT_STR_RAW
197169

198170
QUESTION : '?';
199171

200-
IDENT : XID_start XID_continue* ;
172+
/* Identifier: one XID_Start character followed by any number of XID_Continue
 * characters. Neither rule is defined in this grammar; they are expected to
 * be provided by the `xidstart` and `xidcontinue` grammars named in the
 * `import` statement. */
IDENT : XID_Start XID_Continue* ;
201173

202174
fragment QUESTION_IDENTIFIER : QUESTION? IDENT;
203175

204176
LIFETIME : '\'' IDENT ;
205177

206178
WHITESPACE : [ \r\n\t]+ ;
207179

208-
UNDOC_COMMENT : '////' ~[\r\n]* -> type(COMMENT) ;
180+
/* Four (or more) leading slashes start an ordinary, non-doc comment; the
 * token is re-typed to COMMENT. The body runs up to, but not including, the
 * next '\n'. Only '\n' is excluded, so a '\r' before the newline stays
 * inside the token. */
UNDOC_COMMENT : '////' ~[\n]* -> type(COMMENT) ;
209181
YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
210182
OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
211-
LINE_COMMENT : '//' ~[\r\n]* -> type(COMMENT) ;
183+
/* Plain `//` comment, re-typed to COMMENT. A bare `//` with an empty body
 * matches; otherwise the body runs to the next '\n' ('\r' is kept in the
 * token). The first body character must not be '/' (that is `///` or
 * `////`) and must not be '!' (that is `//!`). Excluding '!' matters on CRLF
 * input: the body here swallows the '\r', so without the exclusion this rule
 * would produce a longer match than OUTER_DOC_COMMENT and the lexer's
 * longest-match rule would type `//! ...\r` as COMMENT, not DOC_COMMENT. */
LINE_COMMENT : '//' ( ~[/!\n] ~[\n]* )? -> type(COMMENT) ;
212184

213185
DOC_BLOCK_COMMENT
214186
: ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
215187
;
216188

217189
BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;
190+
191+
/* these appear at the beginning of a file */
192+
193+
/* Shebang line. Once `#!` has been consumed, the predicate requires the
 * input index to be exactly 2 (so `#!` were the first two characters of the
 * input) and the next character not to be `[`. The rest of the line, up to
 * the first '\r' or '\n', then becomes part of the token.
 * NOTE(review): the `[` exclusion presumably keeps `#![...]` inner
 * attributes from being lexed as a shebang - confirm.
 * NOTE(review): if the input starts with U+FEFF, `#!` ends at index 3 and
 * is_at(2) rejects it - confirm whether BOM + shebang must be supported. */
SHEBANG : '#!' { is_at(2) && _input.LA(1) != '[' }? ~[\r\n]* -> type(SHEBANG) ;
194+
195+
/* Byte-order mark: U+FEFF is accepted only when the input index right after
 * it is 1, i.e. it was the very first character of the input. The match is
 * skipped, so no token is emitted for it. */
UTF8_BOM : '\ufeff' { is_at(1) }? -> skip ;

src/grammar/check.sh

+4-4
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ failed=0
1818
skipped=0
1919

2020
check() {
21-
grep --silent "// ignore-lexer-test" $1;
21+
grep --silent "// ignore-lexer-test" "$1";
2222

2323
# if it's *not* found...
2424
if [ $? -eq 1 ]; then
2525
cd $2 # This `cd` is so java will pick up RustLexer.class. I couldn't
26-
# figure out how to wrangle the CLASSPATH, just adding build/grammr didn't
27-
# seem to have anny effect.
26+
# figure out how to wrangle the CLASSPATH, just adding build/grammar
27+
# didn't seem to have any effect.
2828
# Run the lexer ($3) on the file and pipe its token dump into the verifier
# ($4). "$1" is quoted, as in the grep call above, so a path containing
# whitespace or glob characters is passed through as a single argument.
if $3 RustLexer tokens -tokens < "$1" | $4 "$1" $5; then
2929
echo "pass: $1"
3030
passed=`expr $passed + 1`
@@ -39,7 +39,7 @@ check() {
3939
}
4040

4141
for file in $(find $1 -iname '*.rs' ! -path '*/test/compile-fail*'); do
42-
check $file $2 $3 $4 $5
42+
check "$file" $2 $3 $4 $5
4343
done
4444

4545
printf "\ntest result: "

0 commit comments

Comments
 (0)