-
-
Notifications
You must be signed in to change notification settings - Fork 484
/
string.rs
203 lines (178 loc) · 8.23 KB
/
string.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
use std::cmp::max;
use oxc_allocator::String;
use super::{
cold_branch,
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
Kind, Lexer, LexerContext, Span, Token,
};
use crate::diagnostics;
/// Lower bound on the capacity requested for the arena string which holds the unescaped
/// contents of a string literal containing `\` escapes
/// (see the capacity guess in `handle_string_literal_escape!`).
const MIN_ESCAPED_STR_LEN: usize = 16;
/// Byte match table for `"`-delimited strings.
/// Matches the closing `"`, a line break byte (`\r` or `\n`), or the start of an escape (`\`).
static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'"' | b'\r' | b'\n' | b'\\'));
/// Byte match table for `'`-delimited strings.
/// Matches the closing `'`, a line break byte (`\r` or `\n`), or the start of an escape (`\`).
static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
/// Macro to lex a string literal, starting at its opening quote.
///
/// Outcomes:
/// * Closing `$delimiter` found with no escapes before it: yields `Kind::Str`.
/// * `\` found: hands over to `handle_string_literal_escape!` on a cold path.
/// * Line break or EOF found first: reports an "unterminated string" error and produces
///   `Kind::Undetermined`.
/// * Lexer is in `LexerContext::JsxAttributeValue` context: returns the result of
///   `read_jsx_string_literal` from the enclosing function instead.
///
/// Note that this macro contains `return` statements, so it must be expanded inside
/// a function returning `Kind`.
///
/// # SAFETY
/// `$delimiter` must be an ASCII byte.
/// Next char in `lexer.source` must be ASCII.
/// `$table` must be a `SafeByteMatchTable`.
/// `$table` must only match `$delimiter`, '\', '\r' or '\n'.
macro_rules! handle_string_literal {
($lexer:ident, $delimiter:expr, $table:ident) => {{
debug_assert!($delimiter.is_ascii());
// String literals in JSX attribute values are lexed by a separate method.
if $lexer.context == LexerContext::JsxAttributeValue {
// SAFETY: Caller guarantees `$delimiter` is ASCII, and next char is ASCII
return $lexer.read_jsx_string_literal($delimiter);
}
// Skip opening quote.
// SAFETY: Caller guarantees next byte is ASCII, so safe to advance past it.
let after_opening_quote = $lexer.source.position().add(1);
// Search from just after the opening quote for the first byte matched by `$table`
// (closing quote, line break, or `\`). Reaching EOF first means the string is unterminated.
let next_byte = byte_search! {
lexer: $lexer,
table: $table,
start: after_opening_quote,
handle_eof: {
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
return Kind::Undetermined;
},
};
// Found a matching byte.
// Either end of string found, or a line break, or `\` escape.
match next_byte {
// `$delimiter` is used as a pattern here, so macro users must pass a literal.
$delimiter => {
// SAFETY: Macro user guarantees delimiter is ASCII, so consuming it cannot move
// `lexer.source` off a UTF-8 character boundary.
$lexer.source.next_byte_unchecked();
Kind::Str
}
// Escape found. The string needs unescaping, which is treated as the cold path.
b'\\' => cold_branch(|| {
handle_string_literal_escape!($lexer, $delimiter, $table, after_opening_quote)
}),
_ => {
// Line break. This is impossible in valid JS, so cold path.
cold_branch(|| {
debug_assert!(matches!(next_byte, b'\r' | b'\n'));
// Consume the line break before reporting the error.
$lexer.consume_char();
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
Kind::Undetermined
})
}
}
}};
}
/// Macro to lex the remainder of a string literal once a `\` escape has been found.
///
/// `$after_opening_quote` is the source position just after the opening quote. The text
/// between there and the current position is copied as-is, then the macro alternates
/// between decoding one escape sequence and copying the following run of plain bytes,
/// until the closing `$delimiter` is found.
///
/// On success, the unescaped string is stored via `Lexer::save_string` and the macro
/// yields `Kind::Str`. On a line break or EOF, it reports an "unterminated string" error
/// and returns `Kind::Undetermined` from the enclosing function or closure.
/// An invalid escape sequence is reported as an error, but lexing continues.
///
/// # SAFETY
/// `$delimiter` must be an ASCII byte.
/// `$table` must be a `SafeByteMatchTable` which only matches `$delimiter`, '\', '\r' or '\n'.
macro_rules! handle_string_literal_escape {
($lexer:ident, $delimiter:expr, $table:ident, $after_opening_quote:ident) => {{
// Create arena string to hold unescaped string.
// We don't know how long string will end up being. Take a guess that total length
// will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
let so_far = $lexer.source.str_from_pos_to_current($after_opening_quote);
let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
let mut str = String::with_capacity_in(capacity, $lexer.allocator);
// Push chunk before `\` into `str`.
str.push_str(so_far);
// Each iteration of `'outer` handles one escape sequence, plus the plain text following it.
'outer: loop {
// Consume `\`
// Record the offset first, so an invalid escape can be reported with a span starting at `\`.
let escape_start_offset = $lexer.offset();
$lexer.consume_char();
// Consume escape sequence and add char to `str`
let mut is_valid_escape_sequence = true;
$lexer.read_string_escape_sequence(&mut str, false, &mut is_valid_escape_sequence);
if !is_valid_escape_sequence {
let range = Span::new(escape_start_offset, $lexer.offset());
$lexer.error(diagnostics::invalid_escape_sequence(range));
}
// Consume bytes until reach end of string, line break, or another escape
let chunk_start = $lexer.source.position();
while let Some(b) = $lexer.peek_byte() {
match b {
// Plain string content. Skip over it - it is copied to `str` in one go
// when the end of this chunk is found.
b if !$table.matches(b) => {
// SAFETY: A byte is available, as we just peeked it.
// This may put `source`'s position on a UTF-8 continuation byte, which violates
// `Source`'s invariant temporarily, but the guarantees of `SafeByteMatchTable`
// mean `!table.matches(b)` on this branch prevents exiting this loop until
// `source` is positioned on a UTF-8 character boundary again.
$lexer.source.next_byte_unchecked();
continue;
}
b if b == $delimiter => {
// End of string found. Push last chunk to `str`.
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
str.push_str(chunk);
// Consume closing quote.
// SAFETY: Caller guarantees delimiter is ASCII, so consuming it cannot move
// `lexer.source` off a UTF-8 character boundary
$lexer.source.next_byte_unchecked();
break 'outer;
}
b'\\' => {
// Another escape found. Push last chunk to `str`, and loop back to handle escape.
let chunk = $lexer.source.str_from_pos_to_current(chunk_start);
str.push_str(chunk);
continue 'outer;
}
_ => {
// Line break. This is impossible in valid JS, so cold path.
return cold_branch(|| {
debug_assert!(matches!(b, b'\r' | b'\n'));
// Consume the line break before reporting the error.
$lexer.consume_char();
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
Kind::Undetermined
});
}
}
}
// EOF
// `peek_byte` returned `None` before a closing quote was found.
$lexer.error(diagnostics::unterminated_string($lexer.unterminated_range()));
return Kind::Undetermined;
}
// Convert `str` to arena slice and save to `escaped_strings`
$lexer.save_string(true, str.into_bump_str());
Kind::Str
}};
}
impl<'a> Lexer<'a> {
    /// 12.9.4 String Literals
    ///
    /// Lex a string literal enclosed in double quotes (`"`).
    ///
    /// # SAFETY
    /// The next character in the source must be `"`.
    pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind {
        // SAFETY: Caller promises the next char is `"`, which is ASCII, and is the same
        // ASCII byte passed as delimiter. `DOUBLE_QUOTE_STRING_END_TABLE` is
        // a `SafeByteMatchTable`.
        unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) }
    }

    /// Lex a string literal enclosed in single quotes (`'`).
    ///
    /// # SAFETY
    /// The next character in the source must be `'`.
    pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind {
        // SAFETY: Caller promises the next char is `'`, which is ASCII, and is the same
        // ASCII byte passed as delimiter. `SINGLE_QUOTE_STRING_END_TABLE` is
        // a `SafeByteMatchTable`.
        unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) }
    }

    /// Record the unescaped value `s` for the current token, if the string contained escapes.
    ///
    /// When `has_escape` is `false` this does nothing: a string without escapes can be
    /// read back directly from the source text via the token's span, so only escaped
    /// strings need to be stored. This keeps memory use down and `Token` small.
    ///
    /// When `has_escape` is `true`, `s` is stored in `escaped_strings` keyed by the current
    /// token's start offset, and the current token is flagged as escaped.
    pub(super) fn save_string(&mut self, has_escape: bool, s: &'a str) {
        if has_escape {
            self.escaped_strings.insert(self.token.start, s);
            self.token.escaped = true;
        }
    }

    /// Get the string value of `token`.
    ///
    /// * Token flagged as escaped: the value stored in `escaped_strings` under the token's
    ///   start offset.
    /// * `Kind::Str`: the token's source text without the surrounding quotes.
    /// * `Kind::PrivateIdentifier`: the token's source text without the leading `#`.
    /// * Any other kind: the token's source text unchanged.
    pub(crate) fn get_string(&self, token: Token) -> &'a str {
        if token.escaped {
            self.escaped_strings[&token.start]
        } else {
            let span = token.start as usize..token.end as usize;
            let raw = &self.source.whole()[span];
            match token.kind {
                // Drop the leading `#`
                Kind::PrivateIdentifier => &raw[1..],
                // Drop the opening and closing quotes
                Kind::Str => &raw[1..raw.len() - 1],
                _ => raw,
            }
        }
    }
}