feat(regular_expression): Intro ConstructorParser(and `LiteralParser`) to handle escape sequence in RegExp('pat') (#6635)

Preparation for #6141 `oxc_regular_expression` can already parse and validate both `/regexp-literal/` and `new RegExp("string-literal")`. But one thing that was not well supported is reporting an accurate `Span` for the `RegExp("string-literal-with-\\escape")` case. For example, these two cases produce the same `RegExp` instances in JavaScript: - `/\d+/` - `new RegExp("\\d+")` For now, mainly in `oxc_linter`, the latter case is parsed with `oxc_parser` -> `ast::literal::StringLiteral` AST node -> `value` property. At this point, escape sequences are already resolved(!), so `oxc_regular_expression` can handle the aligned `&str` as an argument without any problem in both cases. However, in terms of `Span` representation, these cases should be handled differently because of the `\\` in string literals... As a result, the parsed AST's `Span` for `new RegExp("string-literal")` is not accurate if it contains escape sequences. E.g. https://github.com/oxc-project/oxc/blob/a01a5dfdafb9cd536cb87867697e3ae43b1990e6/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap#L118-L122 Each time a `\` appears, every subsequent position is shifted. `_` should be placed under `*` in this case. So... to resolve this issue, we need to implement `string_literal_parser` first, and use it as the reading unit of `oxc_regular_expression`.
oxc-project · Oct 21, 2024 · f8e1907 · f8e1907
1 parent 82bc745
commit f8e1907
Show file tree

Hide file tree

Showing 25 changed files with 4,172 additions and 2,879 deletions.
diff --git a/crates/oxc_regular_expression/examples/parse_literal.rs b/crates/oxc_regular_expression/examples/parse_literal.rs
@@ -1,12 +1,12 @@
 #![allow(clippy::print_stdout)]
 
 use oxc_allocator::Allocator;
-use oxc_regular_expression::{Parser, ParserOptions};
+use oxc_regular_expression::{LiteralParser, Options};
 
 fn main() {
     let allocator = Allocator::default();
 
-    for (pattern, flags) in [
+    for (pattern_text, flags_text) in [
         (r"ab", ""),
         (r"abc", "i"),
         (r"abcd", "igv"),
@@ -43,14 +43,16 @@ fn main() {
         (r"[\bb]", ""),
         (r"a{2,1}", "v"), // Error
     ] {
-        let parser = Parser::new(
+        let parser = LiteralParser::new(
             &allocator,
-            pattern,
-            ParserOptions::default().with_span_offset(1).with_flags(flags),
+            pattern_text,
+            Some(flags_text),
+            // +1 for added `/` in error reports
+            Options { pattern_span_offset: 1, ..Options::default() },
         );
         let ret = parser.parse();
 
-        let literal = format!("/{pattern}/{flags}");
+        let literal = format!("/{pattern_text}/{flags_text}");
         println!("Parse: {literal}");
         match ret {
             Ok(pattern) => {

diff --git a/crates/oxc_regular_expression/examples/regex_visitor.rs b/crates/oxc_regular_expression/examples/regex_visitor.rs
@@ -3,7 +3,7 @@
 use oxc_allocator::Allocator;
 use oxc_regular_expression::{
     visit::{RegExpAstKind, Visit},
-    Parser, ParserOptions,
+    LiteralParser, Options,
 };
 use oxc_span::GetSpan;
 
@@ -23,7 +23,7 @@ fn main() {
     let source_text = r"(https?:\/\/github\.com\/(([^\s]+)\/([^\s]+))\/([^\s]+\/)?(issues|pull)\/([0-9]+))|(([^\s]+)\/([^\s]+))?#([1-9][0-9]*)($|[\s\:\;\-\(\=])";
 
     let allocator = Allocator::default();
-    let parser = Parser::new(&allocator, source_text, ParserOptions::default());
+    let parser = LiteralParser::new(&allocator, source_text, None, Options::default());
     let pattern = parser.parse().unwrap();
 
     let mut visitor = TestVisitor;

diff --git a/crates/oxc_regular_expression/src/ast_impl/display.rs b/crates/oxc_regular_expression/src/ast_impl/display.rs
@@ -411,7 +411,7 @@ where
 
 #[cfg(test)]
 mod test {
-    use crate::{Parser, ParserOptions};
+    use crate::{LiteralParser, Options};
     use oxc_allocator::Allocator;
 
     type Case<'a> = (
@@ -557,13 +557,9 @@ mod test {
             let pattern = &input[left_slash + 1..right_slash];
             let flags = &input[right_slash + 1..];
 
-            let actual = Parser::new(
-                allocator,
-                pattern,
-                ParserOptions::default().with_span_offset(1).with_flags(flags),
-            )
-            .parse()
-            .unwrap();
+            let actual = LiteralParser::new(allocator, pattern, Some(flags), Options::default())
+                .parse()
+                .unwrap();
 
             let expect = output.unwrap_or(input);
             assert_eq!(expect, format!("/{actual}/{flags}")); // This uses `Display` impls

diff --git a/crates/oxc_regular_expression/src/diagnostics.rs b/crates/oxc_regular_expression/src/diagnostics.rs
@@ -3,6 +3,31 @@ use oxc_span::Span;
 
 const PREFIX: &str = "Invalid regular expression:";
 
+#[cold]
+pub fn invalid_input(span: Span) -> OxcDiagnostic {
+    OxcDiagnostic::error(format!("{PREFIX} Invalid input string literal")).with_label(span)
+}
+
+// ---
+
+#[cold]
+pub fn unknown_flag(span: Span, flag: &str) -> OxcDiagnostic {
+    OxcDiagnostic::error(format!("{PREFIX} Unknown flag: `{flag}` found")).with_label(span)
+}
+
+#[cold]
+pub fn duplicated_flags(span: Span, flag: &str) -> OxcDiagnostic {
+    OxcDiagnostic::error(format!("{PREFIX} Duplicated flag: `{flag}` found")).with_label(span)
+}
+
+#[cold]
+pub fn invalid_unicode_flags(span: Span) -> OxcDiagnostic {
+    OxcDiagnostic::error(format!("{PREFIX} Invalid unicode flags combination `u` and `v`"))
+        .with_label(span)
+}
+
+// ---
+
 #[cold]
 pub fn duplicated_capturing_group_names(spans: Vec<Span>) -> OxcDiagnostic {
     OxcDiagnostic::error(format!("{PREFIX} Duplicated capturing group names")).with_labels(spans)

diff --git a/crates/oxc_regular_expression/src/lib.rs b/crates/oxc_regular_expression/src/lib.rs
@@ -15,4 +15,92 @@ mod generated {
 }
 
 pub mod ast;
-pub use crate::{ast_impl::visit, options::ParserOptions, parser::Parser};
+pub use crate::{
+    ast_impl::visit,
+    options::Options,
+    parser::{ConstructorParser, LiteralParser},
+};
+
+// LEGACY APIS TO BE REMOVED SOON! ============================================
+
+#[derive(Clone, Copy, Debug, Default)]
+pub struct ParserOptions {
+    pub span_offset: u32,
+    pub unicode_mode: bool,
+    pub unicode_sets_mode: bool,
+    pub parse_string_literal: bool,
+}
+
+impl ParserOptions {
+    #[must_use]
+    pub fn with_span_offset(self, span_offset: u32) -> Self {
+        ParserOptions { span_offset, ..self }
+    }
+
+    #[must_use]
+    pub fn with_flags(self, flags: &str) -> Self {
+        let (mut unicode_mode, mut unicode_sets_mode) = (false, false);
+        for ch in flags.chars() {
+            if ch == 'u' {
+                unicode_mode = true;
+            }
+            if ch == 'v' {
+                unicode_mode = true;
+                unicode_sets_mode = true;
+            }
+        }
+
+        ParserOptions { unicode_mode, unicode_sets_mode, ..self }
+    }
+
+    #[must_use]
+    pub fn with_parse_string_literal(self) -> Self {
+        ParserOptions { parse_string_literal: true, ..self }
+    }
+}
+
+pub struct Parser<'a> {
+    allocator: &'a oxc_allocator::Allocator,
+    source_text: &'a str,
+    options: ParserOptions,
+}
+
+impl<'a> Parser<'a> {
+    pub fn new(
+        allocator: &'a oxc_allocator::Allocator,
+        source_text: &'a str,
+        options: ParserOptions,
+    ) -> Self {
+        Self { allocator, source_text, options }
+    }
+
+    pub fn parse(self) -> oxc_diagnostics::Result<crate::ast::Pattern<'a>> {
+        let ParserOptions { unicode_mode, unicode_sets_mode, span_offset, parse_string_literal } =
+            self.options;
+
+        let options = Options {
+            pattern_span_offset: span_offset,
+            flags_span_offset: 0, // Never be used
+        };
+
+        if parse_string_literal {
+            #[allow(clippy::match_same_arms)]
+            let flags_text = match (unicode_mode, unicode_sets_mode) {
+                (true, false) => Some("'u'"),
+                (false, true) => Some("'v'"),
+                (true, true) => Some("'v'"), // Do not validate this here
+                (false, false) => None,
+            };
+            ConstructorParser::new(self.allocator, self.source_text, flags_text, options).parse()
+        } else {
+            #[allow(clippy::match_same_arms)]
+            let flags_text = match (unicode_mode, unicode_sets_mode) {
+                (true, false) => Some("u"),
+                (false, true) => Some("v"),
+                (true, true) => Some("v"), // Do not validate this here
+                (false, false) => None,
+            };
+            LiteralParser::new(self.allocator, self.source_text, flags_text, options).parse()
+        }
+    }
+}
diff --git a/crates/oxc_regular_expression/src/options.rs b/crates/oxc_regular_expression/src/options.rs
@@ -1,33 +1,7 @@
 #[derive(Clone, Copy, Debug, Default)]
-pub struct ParserOptions {
-    /// Used to adjust Span positions to fit the global source code.
-    pub span_offset: u32,
-    /// Unicode mode(`u` or `v` flag) enabled or not.
-    pub unicode_mode: bool,
-    /// Extended Unicode mode(`v` flag) enabled or not.
-    pub unicode_sets_mode: bool,
-    // TODO: Add `handle_escape_with_quote_type` like option to support `new RegExp("with \"escape\"")`
-}
-
-impl ParserOptions {
-    #[must_use]
-    pub fn with_span_offset(self, span_offset: u32) -> Self {
-        ParserOptions { span_offset, ..self }
-    }
-
-    #[must_use]
-    pub fn with_flags(self, flags: &str) -> Self {
-        let (mut unicode_mode, mut unicode_sets_mode) = (false, false);
-        for ch in flags.chars() {
-            if ch == 'u' {
-                unicode_mode = true;
-            }
-            if ch == 'v' {
-                unicode_mode = true;
-                unicode_sets_mode = true;
-            }
-        }
-
-        ParserOptions { unicode_mode, unicode_sets_mode, ..self }
-    }
+pub struct Options {
+    /// Used to adjust `Span` positions to fit the global source code.
+    pub pattern_span_offset: u32,
+    /// Used to adjust `Span` positions to fit the global source code.
+    pub flags_span_offset: u32,
 }
diff --git a/crates/oxc_regular_expression/src/parser/flags_parser.rs b/crates/oxc_regular_expression/src/parser/flags_parser.rs
@@ -0,0 +1,119 @@
+use oxc_diagnostics::Result;
+use rustc_hash::FxHashSet;
+
+use crate::{
+    diagnostics,
+    parser::{reader::Reader, span_factory::SpanFactory},
+};
+
+pub struct FlagsParser<'a> {
+    reader: Reader<'a>,
+    span_factory: SpanFactory,
+}
+
+impl<'a> FlagsParser<'a> {
+    pub fn new(reader: Reader<'a>, span_offset: u32) -> Self {
+        Self { reader, span_factory: SpanFactory::new(span_offset) }
+    }
+
+    /// Returns: (is_unicode_mode, is_unicode_sets_mode)
+    pub fn parse(mut self) -> Result<(bool, bool)> {
+        let mut is_unicode_mode = false;
+        let mut is_unicode_sets_mode = false;
+        let mut unique_flags = FxHashSet::default();
+
+        while let Some(cp) = self.reader.peek() {
+            let span_start = self.reader.offset();
+            self.reader.advance();
+            let span_end = self.reader.offset();
+
+            if unique_flags.contains(&cp) {
+                return Err(diagnostics::duplicated_flags(
+                    self.span_factory.create(span_start, span_end),
+                    &self.reader.atom(span_start, span_end),
+                ));
+            }
+            if char::try_from(cp)
+                .map_or(true, |c| !matches!(c, 'd' | 'g' | 'i' | 'm' | 's' | 'u' | 'v' | 'y'))
+            {
+                return Err(diagnostics::unknown_flag(
+                    self.span_factory.create(span_start, span_end),
+                    &self.reader.atom(span_start, span_end),
+                ));
+            }
+
+            if cp == 'u' as u32 {
+                if unique_flags.contains(&('v' as u32)) {
+                    return Err(diagnostics::invalid_unicode_flags(
+                        self.span_factory.create(span_start, span_end),
+                    ));
+                }
+                is_unicode_mode = true;
+            }
+            if cp == 'v' as u32 {
+                if unique_flags.contains(&('u' as u32)) {
+                    return Err(diagnostics::invalid_unicode_flags(
+                        self.span_factory.create(span_start, span_end),
+                    ));
+                }
+                is_unicode_mode = true;
+                is_unicode_sets_mode = true;
+            }
+
+            unique_flags.insert(cp);
+        }
+
+        Ok((is_unicode_mode, is_unicode_sets_mode))
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn should_pass() {
+        for (flags_text, expected) in &[
+            ("", (false, false)),
+            ("i", (false, false)),
+            ("u", (true, false)),
+            ("v", (true, true)),
+            ("vg", (true, true)),
+        ] {
+            let reader = Reader::initialize(flags_text, true, false).unwrap();
+            let result = FlagsParser::new(reader, 0).parse().unwrap();
+            assert_eq!(result, *expected);
+        }
+    }
+
+    #[test]
+    fn should_fail() {
+        for flags_text in &["uv", "vu", "uu", "vv", "gg", "$"] {
+            let reader = Reader::initialize(flags_text, true, false).unwrap();
+            let err = FlagsParser::new(reader, 0).parse();
+            assert!(err.is_err());
+            // println!("{:?}", err.unwrap_err().with_source_code(*flags_text));
+        }
+        for flags_text in &[r#""uv""#, "'V'", "\"-\"", r#""\162""#] {
+            let reader = Reader::initialize(flags_text, true, true).unwrap();
+            let err = FlagsParser::new(reader, 0).parse();
+            assert!(err.is_err());
+            // println!("{:?}", err.unwrap_err().with_source_code(*flags_text));
+        }
+    }
+
+    #[test]
+    fn string_literal() {
+        for reader in [
+            Reader::initialize("u", true, false).unwrap(),
+            Reader::initialize("'u'", true, true).unwrap(),
+            Reader::initialize(r#""\165""#, true, true).unwrap(),
+            Reader::initialize(r#""\x75""#, true, true).unwrap(),
+            Reader::initialize(r#""\u0075""#, true, true).unwrap(),
+            Reader::initialize(r#""\u{0075}""#, true, true).unwrap(),
+        ] {
+            let result = FlagsParser::new(reader, 0).parse().unwrap();
+            assert_eq!(result, (true, false));
+        }
+    }
+}