1
1
lexer grammar RustLexer;
2
2
3
+ @lexer::members {
4
+ /* Returns true when the input stream's current index equals `pos`.
+  * Called from the SHEBANG and UTF8_BOM predicates, which only match
+  * at a fixed offset in the input. */
+ public boolean is_at(int pos) {
5
+ return _input.index() == pos;
6
+ }
7
+ }
8
+
9
+
3
10
tokens {
4
11
EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
5
12
MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
@@ -8,14 +15,10 @@ tokens {
8
15
LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
9
16
LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
10
17
LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
11
- COMMENT
18
+ COMMENT, SHEBANG
12
19
}
13
20
14
- /* Note: due to antlr limitations, we can't represent XID_start and
15
- * XID_continue properly. ASCII-only substitute. */
16
-
17
- fragment XID_start : [_a-zA-Z ] ;
18
- fragment XID_continue : [_a-zA-Z0 -9] ;
21
+ import xidstart , xidcontinue;
19
22
20
23
21
24
/* Expression-operator symbols */
@@ -90,94 +93,63 @@ fragment CHAR_ESCAPE
90
93
| [xX] HEXIT HEXIT
91
94
| ' u' HEXIT HEXIT HEXIT HEXIT
92
95
| ' U ' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
96
+ | ' u{' HEXIT ' } '
97
+ | ' u{' HEXIT HEXIT ' } '
98
+ | ' u{' HEXIT HEXIT HEXIT ' } '
99
+ | ' u{' HEXIT HEXIT HEXIT HEXIT ' } '
100
+ | ' u{' HEXIT HEXIT HEXIT HEXIT HEXIT ' } '
101
+ | ' u{' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT ' } '
93
102
;
94
103
95
104
/* A literal suffix is lexed as any identifier (IDENT). */
fragment SUFFIX
96
105
: IDENT
97
106
;
98
107
108
+ /* Suffix allowed after an integer literal: a SUFFIX whose first character
+  * is neither 'e' nor 'E' (presumably so an exponent such as `1e3` is not
+  * swallowed as an integer suffix -- confirm against LIT_FLOAT). */
+ fragment INTEGER_SUFFIX
109
+ : { _input.LA(1) != 'e' && _input.LA(1) != 'E' }? SUFFIX
110
+ ;
111
+
99
112
/* Character literal: a backslash escape, any single char other than
 * backslash, apostrophe, newline, tab or carriage return, or a UTF-16
 * surrogate pair (high surrogate followed by low surrogate); optionally
 * followed by a suffix. */
LIT_CHAR
100
- : ' \' ' ( ' \\ ' CHAR_ESCAPE | ~[\\' \n\t\r ] ) ' \' ' SUFFIX ?
113
+ : '\'' ( '\\' CHAR_ESCAPE
114
+ | ~[\\'\n\t\r]
115
+ | '\ud800'..'\udbff' '\udc00'..'\udfff'
116
+ )
117
+ '\'' SUFFIX?
101
118
;
102
119
103
120
/* Byte literal: the letter b followed by a quoted hex escape, simple
 * escape, or single char other than backslash, apostrophe, newline, tab or
 * carriage return (optionally trailed by a low surrogate); optionally
 * followed by a suffix. */
LIT_BYTE
104
- : ' b\' ' ( ' \\ ' ( [xX] HEXIT HEXIT | [nrt\\' "0] ) | ~[\\ ' \n\t\r] ) ' \' ' SUFFIX ?
121
+ : 'b\'' ( '\\' ( [xX] HEXIT HEXIT
122
+ | [nrt\\'"0] )
123
+ | ~[\\'\n\t\r] '\udc00'..'\udfff'?
124
+ )
125
+ '\'' SUFFIX?
105
126
;
106
127
107
128
/* Integer literal: decimal, or binary / octal / hexadecimal introduced by
 * 0b / 0o / 0x. Underscores are allowed among the digits; an optional
 * INTEGER_SUFFIX may follow. */
LIT_INTEGER
108
- : [0-9][0-9_]* SUFFIX ?
109
- | ' 0b' [01][01_]* SUFFIX ?
110
- | ' 0o' [0-7][0-7_]* SUFFIX ?
111
- | ' 0x' [0-9a-fA-F ][0-9a-fA-F_ ]* SUFFIX ?
129
+
130
+ : [0-9][0-9_]* INTEGER_SUFFIX?
131
+ | '0b' [01_]+ INTEGER_SUFFIX?
132
+ | '0o' [0-7_]+ INTEGER_SUFFIX?
133
+ | '0x' [0-9a-fA-F_]+ INTEGER_SUFFIX?
112
134
;
113
135
114
136
/* Float literal: decimal digits followed either by a bare dot (accepted
 * only when the next character is not another dot, an underscore or an
 * ASCII letter) or by an optional fraction, optional exponent and optional
 * suffix. */
LIT_FLOAT
115
137
: [0-9][0-9_]* ('.' {
116
- /* dot followed by another dot is a range, no float */
138
+ /* dot followed by another dot is a range, not a float */
117
139
_input.LA(1) != '.' &&
118
- /* dot followed by an identifier is an integer with a function call, no float */
140
+ /* dot followed by an identifier is an integer with a function call, not a float */
119
141
_input.LA(1) != '_' &&
120
- _input.LA(1 ) != ' a' &&
121
- _input.LA(1 ) != ' b' &&
122
- _input.LA(1 ) != ' c' &&
123
- _input.LA(1 ) != ' d' &&
124
- _input.LA(1 ) != ' e' &&
125
- _input.LA(1 ) != ' f' &&
126
- _input.LA(1 ) != ' g' &&
127
- _input.LA(1 ) != ' h' &&
128
- _input.LA(1 ) != ' i' &&
129
- _input.LA(1 ) != ' j' &&
130
- _input.LA(1 ) != ' k' &&
131
- _input.LA(1 ) != ' l' &&
132
- _input.LA(1 ) != ' m' &&
133
- _input.LA(1 ) != ' n' &&
134
- _input.LA(1 ) != ' o' &&
135
- _input.LA(1 ) != ' p' &&
136
- _input.LA(1 ) != ' q' &&
137
- _input.LA(1 ) != ' r' &&
138
- _input.LA(1 ) != ' s' &&
139
- _input.LA(1 ) != ' t' &&
140
- _input.LA(1 ) != ' u' &&
141
- _input.LA(1 ) != ' v' &&
142
- _input.LA(1 ) != ' w' &&
143
- _input.LA(1 ) != ' x' &&
144
- _input.LA(1 ) != ' y' &&
145
- _input.LA(1 ) != ' z' &&
146
- _input.LA(1 ) != ' A' &&
147
- _input.LA(1 ) != ' B' &&
148
- _input.LA(1 ) != ' C' &&
149
- _input.LA(1 ) != ' D' &&
150
- _input.LA(1 ) != ' E' &&
151
- _input.LA(1 ) != ' F' &&
152
- _input.LA(1 ) != ' G' &&
153
- _input.LA(1 ) != ' H' &&
154
- _input.LA(1 ) != ' I' &&
155
- _input.LA(1 ) != ' J' &&
156
- _input.LA(1 ) != ' K' &&
157
- _input.LA(1 ) != ' L' &&
158
- _input.LA(1 ) != ' M' &&
159
- _input.LA(1 ) != ' N' &&
160
- _input.LA(1 ) != ' O' &&
161
- _input.LA(1 ) != ' P' &&
162
- _input.LA(1 ) != ' Q' &&
163
- _input.LA(1 ) != ' R' &&
164
- _input.LA(1 ) != ' S' &&
165
- _input.LA(1 ) != ' T' &&
166
- _input.LA(1 ) != ' U' &&
167
- _input.LA(1 ) != ' V' &&
168
- _input.LA(1 ) != ' W' &&
169
- _input.LA(1 ) != ' X' &&
170
- _input.LA(1 ) != ' Y' &&
171
- _input.LA(1 ) != ' Z'
142
+ !(_input.LA(1) >= 'a' && _input.LA(1) <= 'z') &&
143
+ !(_input.LA(1) >= 'A' && _input.LA(1) <= 'Z')
172
144
}? | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
173
145
;
174
146
175
147
/* String literal: double-quoted, non-greedy body that may contain
 * backslash-newline continuations (LF or CRLF) and backslash escapes;
 * optionally followed by a suffix. */
LIT_STR
176
148
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
177
149
;
178
150
179
- LIT_BINARY : ' b' LIT_STR SUFFIX ? ;
180
- LIT_BINARY_RAW : ' rb ' LIT_STR_RAW SUFFIX ? ;
151
+ /* Byte string literal: the letter b immediately followed by a LIT_STR. */
+ LIT_BINARY : 'b' LIT_STR ;
152
+ /* Raw byte string literal: the letter b immediately followed by a LIT_STR_RAW. */
+ LIT_BINARY_RAW : 'b' LIT_STR_RAW ;
181
153
182
154
/* this is a bit messy */
183
155
@@ -197,21 +169,27 @@ LIT_STR_RAW
197
169
198
170
/* The question-mark token. */
QUESTION : '?' ;
199
171
200
- IDENT : XID_start XID_continue * ;
172
+ /* Identifier: one XID_Start character, then any number of XID_Continue characters. */
+ IDENT
+ : XID_Start XID_Continue*
+ ;
201
173
202
174
/* An identifier optionally preceded by a question mark. */
fragment QUESTION_IDENTIFIER
: QUESTION? IDENT
;
203
175
204
176
/* Lifetime: an apostrophe immediately followed by an identifier. */
LIFETIME : '\'' IDENT ;
205
177
206
178
/* One or more spaces, tabs, carriage returns or newlines. */
WHITESPACE
: [ \t\r\n]+
;
207
179
208
- UNDOC_COMMENT : ' ////' ~[\r\ n]* -> type(COMMENT ) ;
180
+ /* Four slashes start an ordinary (non-doc) comment that runs up to the next \n. */
+ UNDOC_COMMENT : '////' ~[\n]* -> type(COMMENT) ;
209
181
/* Three slashes start a doc comment that runs to the end of the line. */
YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
210
182
/* Two slashes and an exclamation mark start a doc comment that runs to the end of the line. */
OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
211
- LINE_COMMENT : ' //' ~[\r \n]* -> type(COMMENT ) ;
183
+ /* Plain line comment: two slashes, then optionally one character that is
+  * neither a slash nor \n followed by everything up to the next \n. */
+ LINE_COMMENT : '//' ( ~[/\n] ~[\n]* )? -> type(COMMENT) ;
212
184
213
185
// Block doc comment: opens with slash-star-star followed by a non-star
// character, or with slash-star-bang; may nest; emitted as DOC_COMMENT.
DOC_BLOCK_COMMENT
214
186
: ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
215
187
;
216
188
217
189
// Ordinary block comment; may nest; emitted as COMMENT.
BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;
190
+
191
+ /* these appear at the beginning of a file */
192
+
193
+ /* Shebang line: `#!` accepted only when the input index is 2 right after it
+  * and the next character is not `[`; consumes the rest of the line. */
+ SHEBANG : '#!' { is_at(2) && _input.LA(1) != '[' }? ~[\r\n]* -> type(SHEBANG) ;
194
+
195
+ /* Byte-order mark U+FEFF, accepted only when the input index is 1 right
+  * after it; skipped rather than emitted as a token. */
+ UTF8_BOM : '\ufeff' { is_at(1) }? -> skip ;
0 commit comments