-
-
Notifications
You must be signed in to change notification settings - Fork 482
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(regular_expression): Introduce `ConstructorParser` (and `LiteralParser`) to handle escape sequences in RegExp('pat') (#6635)
Preparation for #6141. `oxc_regular_expression` can already parse and validate both `/regexp-literal/` and `new RegExp("string-literal")`. But one thing that was not well supported was reporting the `Span` for the `RegExp("string-literal-with-\\escape")` case. For example, these two cases produce the same `RegExp` instance in JavaScript: - `/\d+/` - `new RegExp("\\d+")` For now, mainly in `oxc_linter`, the latter case is parsed with `oxc_parser` -> `ast::literal::StringLiteral` AST node -> `value` property. At this point, escape sequences are already resolved(!), so `oxc_regular_expression` can handle the resulting (unescaped) `&str` as an argument without any problem in both cases. However, in terms of `Span` representation, these cases should be handled differently because of the `\\` in string literals... As a result, the parsed AST's `Span` for `new RegExp("string-literal")` is not accurate if the literal contains escape sequences, e.g. https://github.com/oxc-project/oxc/blob/a01a5dfdafb9cd536cb87867697e3ae43b1990e6/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap#L118-L122 Each time a `\` appears, every subsequent position is shifted. `_` should be placed under `*` in this case. So... to resolve this issue, we need to implement `string_literal_parser` first, and use it as the reading unit of `oxc_regular_expression`.
- Loading branch information
Showing
25 changed files
with
4,172 additions
and
2,879 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,7 @@ | ||
#[derive(Clone, Copy, Debug, Default)] | ||
pub struct ParserOptions { | ||
/// Used to adjust Span positions to fit the global source code. | ||
pub span_offset: u32, | ||
/// Unicode mode(`u` or `v` flag) enabled or not. | ||
pub unicode_mode: bool, | ||
/// Extended Unicode mode(`v` flag) enabled or not. | ||
pub unicode_sets_mode: bool, | ||
// TODO: Add `handle_escape_with_quote_type` like option to support `new RegExp("with \"escape\"")` | ||
} | ||
|
||
impl ParserOptions { | ||
#[must_use] | ||
pub fn with_span_offset(self, span_offset: u32) -> Self { | ||
ParserOptions { span_offset, ..self } | ||
} | ||
|
||
#[must_use] | ||
pub fn with_flags(self, flags: &str) -> Self { | ||
let (mut unicode_mode, mut unicode_sets_mode) = (false, false); | ||
for ch in flags.chars() { | ||
if ch == 'u' { | ||
unicode_mode = true; | ||
} | ||
if ch == 'v' { | ||
unicode_mode = true; | ||
unicode_sets_mode = true; | ||
} | ||
} | ||
|
||
ParserOptions { unicode_mode, unicode_sets_mode, ..self } | ||
} | ||
pub struct Options { | ||
/// Used to adjust `Span` positions to fit the global source code. | ||
pub pattern_span_offset: u32, | ||
/// Used to adjust `Span` positions to fit the global source code. | ||
pub flags_span_offset: u32, | ||
} |
119 changes: 119 additions & 0 deletions
119
crates/oxc_regular_expression/src/parser/flags_parser.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
use oxc_diagnostics::Result; | ||
use rustc_hash::FxHashSet; | ||
|
||
use crate::{ | ||
diagnostics, | ||
parser::{reader::Reader, span_factory::SpanFactory}, | ||
}; | ||
|
||
pub struct FlagsParser<'a> { | ||
reader: Reader<'a>, | ||
span_factory: SpanFactory, | ||
} | ||
|
||
impl<'a> FlagsParser<'a> { | ||
pub fn new(reader: Reader<'a>, span_offset: u32) -> Self { | ||
Self { reader, span_factory: SpanFactory::new(span_offset) } | ||
} | ||
|
||
/// Returns: (is_unicode_mode, is_unicode_sets_mode) | ||
pub fn parse(mut self) -> Result<(bool, bool)> { | ||
let mut is_unicode_mode = false; | ||
let mut is_unicode_sets_mode = false; | ||
let mut unique_flags = FxHashSet::default(); | ||
|
||
while let Some(cp) = self.reader.peek() { | ||
let span_start = self.reader.offset(); | ||
self.reader.advance(); | ||
let span_end = self.reader.offset(); | ||
|
||
if unique_flags.contains(&cp) { | ||
return Err(diagnostics::duplicated_flags( | ||
self.span_factory.create(span_start, span_end), | ||
&self.reader.atom(span_start, span_end), | ||
)); | ||
} | ||
if char::try_from(cp) | ||
.map_or(true, |c| !matches!(c, 'd' | 'g' | 'i' | 'm' | 's' | 'u' | 'v' | 'y')) | ||
{ | ||
return Err(diagnostics::unknown_flag( | ||
self.span_factory.create(span_start, span_end), | ||
&self.reader.atom(span_start, span_end), | ||
)); | ||
} | ||
|
||
if cp == 'u' as u32 { | ||
if unique_flags.contains(&('v' as u32)) { | ||
return Err(diagnostics::invalid_unicode_flags( | ||
self.span_factory.create(span_start, span_end), | ||
)); | ||
} | ||
is_unicode_mode = true; | ||
} | ||
if cp == 'v' as u32 { | ||
if unique_flags.contains(&('u' as u32)) { | ||
return Err(diagnostics::invalid_unicode_flags( | ||
self.span_factory.create(span_start, span_end), | ||
)); | ||
} | ||
is_unicode_mode = true; | ||
is_unicode_sets_mode = true; | ||
} | ||
|
||
unique_flags.insert(cp); | ||
} | ||
|
||
Ok((is_unicode_mode, is_unicode_sets_mode)) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use super::*;

    /// Builds a reader over `source` and runs the flags parser on it.
    ///
    /// `string_literal_mode` is forwarded as the third argument of
    /// `Reader::initialize`; the cases below set it for quoted inputs.
    /// NOTE(review): the exact meaning of the second and third arguments is
    /// defined by `Reader::initialize` — confirm there.
    fn parse_flags(source: &str, string_literal_mode: bool) -> Result<(bool, bool)> {
        let reader = Reader::initialize(source, true, string_literal_mode).unwrap();
        FlagsParser::new(reader, 0).parse()
    }

    #[test]
    fn should_pass() {
        let cases = [
            ("", (false, false)),
            ("i", (false, false)),
            ("u", (true, false)),
            ("v", (true, true)),
            ("vg", (true, true)),
        ];

        for (flags_text, expected) in cases {
            assert_eq!(parse_flags(flags_text, false).unwrap(), expected);
        }
    }

    #[test]
    fn should_fail() {
        // Debugging hint: the rendered diagnostic can be inspected with
        // `err.unwrap_err().with_source_code(flags_text)`.

        // Raw flag text.
        for flags_text in ["uv", "vu", "uu", "vv", "gg", "$"] {
            let err = parse_flags(flags_text, false);
            assert!(err.is_err());
        }

        // Quoted inputs.
        for flags_text in [r#""uv""#, "'V'", "\"-\"", r#""\162""#] {
            let err = parse_flags(flags_text, true);
            assert!(err.is_err());
        }
    }

    #[test]
    fn string_literal() {
        // Every spelling below is expected to be read as the single flag `u`.
        let cases = [
            ("u", false),
            ("'u'", true),
            (r#""\165""#, true),
            (r#""\x75""#, true),
            (r#""\u0075""#, true),
            (r#""\u{0075}""#, true),
        ];

        for (source, string_literal_mode) in cases {
            let result = parse_flags(source, string_literal_mode).unwrap();
            assert_eq!(result, (true, false));
        }
    }
}
Oops, something went wrong.