Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(regular_expression): Update example to support RegExp constructor #5106

Merged
merged 1 commit into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 65 additions & 13 deletions crates/oxc_regular_expression/examples/parse_file.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
#![allow(clippy::print_stdout)]
#![allow(clippy::print_stdout, clippy::cast_possible_truncation)]
use std::{env, fs, path::Path, sync::Arc};

use oxc_allocator::Allocator;
use oxc_ast::AstKind;
use oxc_ast::{ast, AstKind};
use oxc_parser::Parser;
use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser};
use oxc_semantic::SemanticBuilder;
use oxc_span::SourceType;

fn main() {
// 1. Get the file content and parse
let name = env::args().nth(1).unwrap_or_else(|| "test.js".to_string());
let path = Path::new(&name);

Expand All @@ -26,23 +28,31 @@ fn main() {
return;
}

// 2. Build the semantic to iterate over the nodes
let program = allocator.alloc(parser_ret.program);
let semantic_ret = SemanticBuilder::new(&source_text, source_type).build(program);
let semantic = semantic_ret.semantic;

// 3. Parse regular expressions
// - RegExpLiteral
// - new RegExp() with string or template literal if static
for node in semantic.nodes().iter() {
match node.kind() {
AstKind::RegExpLiteral(re) => {
let literal = re.span.source_text(&source_text);
let parsed = oxc_regular_expression::Parser::new(
println!("🍀 {}", re.span.source_text(&source_text));

let parsed = PatternParser::new(
&allocator,
literal,
oxc_regular_expression::ParserOptions::default()
.with_span_offset(re.span.start),
re.regex.pattern.as_str(),
ParserOptions {
span_offset: re.span.start + 1,
unicode_mode: re.regex.flags.contains(ast::RegExpFlags::U)
|| re.regex.flags.contains(ast::RegExpFlags::V),
unicode_sets_mode: re.regex.flags.contains(ast::RegExpFlags::V),
},
)
.parse();

println!("🍀 {literal}");
if let Err(error) = parsed {
let error = error.with_source_code(Arc::clone(&source_text));
println!("{error:?}");
Expand All @@ -51,18 +61,60 @@ fn main() {
println!("{parsed:#?}");
println!();
}
AstKind::NewExpression(new_expr) => {
AstKind::NewExpression(new_expr)
if new_expr
.callee
.get_identifier_reference()
.filter(|ident| ident.name == "RegExp")
.is_some()
{
println!("👻 TODO: new RegExp(...)");
println!();
.is_some() =>
{
println!("🍀 {}", new_expr.span.source_text(&source_text));

let pattern = match new_expr.arguments.first() {
Some(ast::Argument::StringLiteral(sl)) => &sl.value,
Some(ast::Argument::TemplateLiteral(tl))
if tl.is_no_substitution_template() =>
{
&tl.quasi().unwrap()
}
_ => {
continue;
}
};

let flags = match new_expr.arguments.get(1) {
Some(ast::Argument::StringLiteral(sl)) => &sl.value,
Some(ast::Argument::TemplateLiteral(tl))
if tl.is_no_substitution_template() =>
{
&tl.quasi().unwrap()
}
_ => "",
};

let flags =
FlagsParser::new(&allocator, flags, ParserOptions::default()).parse().unwrap();
let parsed = PatternParser::new(
&allocator,
pattern,
ParserOptions {
span_offset: new_expr.span.start + 12, // = "new RegExp(\"".len()
unicode_mode: flags.unicode || flags.unicode_sets,
unicode_sets_mode: flags.unicode_sets,
},
)
.parse();

if let Err(error) = parsed {
let error = error.with_source_code(Arc::clone(&source_text));
println!("{error:?}");
return;
}
println!("{parsed:#?}");
println!();
}
_ => {}
}
}
println!("✨ All parsed!");
}
10 changes: 7 additions & 3 deletions crates/oxc_regular_expression/examples/test.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
const re1 = /abc{1}/gsv;
const re2 = new RegExp("ooo", "u");
const re3 = /[\w--[v]]/gsv;
// All of them should be the same result!
[
/\1(.)\\"'`a/v,
new RegExp("\\1(.)\\\\\"'`\a","v"),
new RegExp('\\1(.)\\\\"\'`\a','v'),
new RegExp(`\\1(.)\\\\"'\`\a`,`v`),
]
16 changes: 14 additions & 2 deletions crates/oxc_regular_expression/src/body_parser/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,29 @@ pub struct Reader<'a> {
unicode_mode: bool,
/// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
index: usize,
// NOTE: Distinguishing these 2 units looks cleaner, but it may not be necessary.
//
// If I understand correctly (and there are no unexpected factors),
// AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
//
// Therefore, performance might be improved by:
// - using only `u8_units`, and
// - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
// However, I'm not certain this approach is faster than the current one, which uses `encode_utf16()` all at once.
/// Iteration units for unicode mode.
/// Even in non-unicode mode, used for `Span` offset calculation.
u8_units: Vec<(usize, char)>,
/// Iteration units for non-unicode mode.
u16_units: Vec<u16>,
/// Last offset caches for non-unicode mode.
last_offset_indices: (usize, usize),
}

impl<'a> Reader<'a> {
pub fn new(source: &'a str, unicode_mode: bool) -> Self {
// NOTE: Distinguishing these 2 units looks cleaner, but it may not be necessary.
// As a parser, AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
// NOTE: Collecting into a `Vec` may not be efficient if the source is too large.
// Implementing a lookahead cache with `VecDeque` might be better...?
// But when I tried it once, there were no notable improvements.
let u8_units = source.char_indices().collect::<Vec<_>>();
let u16_units = if unicode_mode { "" } else { source }.encode_utf16().collect::<Vec<_>>();

Expand All @@ -26,6 +36,8 @@ impl<'a> Reader<'a> {
if self.unicode_mode {
self.u8_units.get(self.index).map_or(self.source.len(), |(idx, _)| *idx)
} else {
// NOTE: This does not return a valid `Span` offset for surrogate pairs.
// In the first place, there is no such thing as a string slice corresponding to them...
let (mut u16_idx, mut u8_idx) = self.last_offset_indices;
for (idx, ch) in &self.u8_units[u8_idx..] {
if self.index <= u16_idx {
Expand Down