oxc-project · graphite-app · Aug 23, 2024 · Aug 23, 2024
@@ -1,13 +1,15 @@
-#![allow(clippy::print_stdout)]
+#![allow(clippy::print_stdout, clippy::cast_possible_truncation)]
 use std::{env, fs, path::Path, sync::Arc};
 
 use oxc_allocator::Allocator;
-use oxc_ast::AstKind;
+use oxc_ast::{ast, AstKind};
 use oxc_parser::Parser;
+use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser};
 use oxc_semantic::SemanticBuilder;
 use oxc_span::SourceType;
 
 fn main() {
+    // 1. Get the file content and parse
     let name = env::args().nth(1).unwrap_or_else(|| "test.js".to_string());
     let path = Path::new(&name);
 
@@ -26,23 +28,31 @@ fn main() {
         return;
     }
 
+    // 2. Build the semantic to iterate over the nodes
     let program = allocator.alloc(parser_ret.program);
     let semantic_ret = SemanticBuilder::new(&source_text, source_type).build(program);
     let semantic = semantic_ret.semantic;
 
+    // 3. Parse regular expressions
+    // - RegExpLiteral
+    // - new RegExp() with string or template literal if static
     for node in semantic.nodes().iter() {
         match node.kind() {
             AstKind::RegExpLiteral(re) => {
-                let literal = re.span.source_text(&source_text);
-                let parsed = oxc_regular_expression::Parser::new(
+                println!("🍀 {}", re.span.source_text(&source_text));
+
+                let parsed = PatternParser::new(
                     &allocator,
-                    literal,
-                    oxc_regular_expression::ParserOptions::default()
-                        .with_span_offset(re.span.start),
+                    re.regex.pattern.as_str(),
+                    ParserOptions {
+                        span_offset: re.span.start + 1,
+                        unicode_mode: re.regex.flags.contains(ast::RegExpFlags::U)
+                            || re.regex.flags.contains(ast::RegExpFlags::V),
+                        unicode_sets_mode: re.regex.flags.contains(ast::RegExpFlags::V),
+                    },
                 )
                 .parse();
 
-                println!("🍀 {literal}");
                 if let Err(error) = parsed {
                     let error = error.with_source_code(Arc::clone(&source_text));
                     println!("{error:?}");
@@ -51,18 +61,60 @@ fn main() {
                 println!("{parsed:#?}");
                 println!();
             }
-            AstKind::NewExpression(new_expr) => {
+            AstKind::NewExpression(new_expr)
                 if new_expr
                     .callee
                     .get_identifier_reference()
                     .filter(|ident| ident.name == "RegExp")
-                    .is_some()
-                {
-                    println!("👻 TODO: new RegExp(...)");
-                    println!();
+                    .is_some() =>
+            {
+                println!("🍀 {}", new_expr.span.source_text(&source_text));
+
+                let pattern = match new_expr.arguments.first() {
+                    Some(ast::Argument::StringLiteral(sl)) => &sl.value,
+                    Some(ast::Argument::TemplateLiteral(tl))
+                        if tl.is_no_substitution_template() =>
+                    {
+                        &tl.quasi().unwrap()
+                    }
+                    _ => {
+                        continue;
+                    }
+                };
+
+                let flags = match new_expr.arguments.get(1) {
+                    Some(ast::Argument::StringLiteral(sl)) => &sl.value,
+                    Some(ast::Argument::TemplateLiteral(tl))
+                        if tl.is_no_substitution_template() =>
+                    {
+                        &tl.quasi().unwrap()
+                    }
+                    _ => "",
+                };
+
+                let flags =
+                    FlagsParser::new(&allocator, flags, ParserOptions::default()).parse().unwrap();
+                let parsed = PatternParser::new(
+                    &allocator,
+                    pattern,
+                    ParserOptions {
+                        span_offset: new_expr.span.start + 12, // = "new RegExp(\"".len()
+                        unicode_mode: flags.unicode || flags.unicode_sets,
+                        unicode_sets_mode: flags.unicode_sets,
+                    },
+                )
+                .parse();
+
+                if let Err(error) = parsed {
+                    let error = error.with_source_code(Arc::clone(&source_text));
+                    println!("{error:?}");
+                    return;
                 }
+                println!("{parsed:#?}");
+                println!();
             }
             _ => {}
         }
     }
+    println!("✨ All parsed!");
 }
@@ -1,3 +1,7 @@
-const re1 = /abc{1}/gsv;
-const re2 = new RegExp("ooo", "u");
-const re3 = /[\w--[v]]/gsv;
+// All of them should be the same result!
+[
+  /\1(.)\\"'`a/v,
+  new RegExp("\\1(.)\\\\\"'`\a","v"),
+  new RegExp('\\1(.)\\\\"\'`\a','v'),
+  new RegExp(`\\1(.)\\\\"'\`\a`,`v`),
+]
@@ -3,19 +3,29 @@ pub struct Reader<'a> {
     unicode_mode: bool,
     /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
     index: usize,
+    // NOTE: Distinguishing these 2 units looks cleaner, but it may not be necessary.
+    //
+    // If I understand correctly (and there are no unexpected factors),
+    // AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
+    //
+    // Therefore, performance might be improved by:
+    // - using only `u8_units`, and
+    // - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
+    // However, I'm not certain this approach is faster than the current one using `encode_utf16()` all at once.
+    /// Iteration units for unicode mode.
     /// Even in non-unicode mode, used for `Span` offset calculation.
     u8_units: Vec<(usize, char)>,
+    /// Iteration units for non-unicode mode.
     u16_units: Vec<u16>,
     /// Last offset caches for non-unicode mode.
     last_offset_indices: (usize, usize),
 }
 
 impl<'a> Reader<'a> {
     pub fn new(source: &'a str, unicode_mode: bool) -> Self {
-        // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary.
-        // As as a parser, AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
         // NOTE: Collecting `Vec` may not be efficient if the source is too large.
         // Implementing a lookahead cache with `VecDeque` might be better...?
+        // But when I tried it once, there were no notable improvements.
         let u8_units = source.char_indices().collect::<Vec<_>>();
         let u16_units = if unicode_mode { "" } else { source }.encode_utf16().collect::<Vec<_>>();
 
@@ -26,6 +36,8 @@ impl<'a> Reader<'a> {
         if self.unicode_mode {
             self.u8_units.get(self.index).map_or(self.source.len(), |(idx, _)| *idx)
         } else {
+            // NOTE: This does not return a valid `Span` offset for surrogate pairs.
+            // In the first place, there is no such thing as a string slice corresponding to them...
             let (mut u16_idx, mut u8_idx) = self.last_offset_indices;
             for (idx, ch) in &self.u8_units[u8_idx..] {
                 if self.index <= u16_idx {