Skip to content

Commit

Permalink
feat(regular_expression): Intro ConstructorParser(and `LiteralParse…
Browse files Browse the repository at this point in the history
…r`) to handle escape sequence in RegExp('pat') (#6635)

Preparation for #6141

`oxc_regular_expression` can already parse and validate both `/regexp-literal/` and `new RegExp("string-literal")`.

However, one thing that was not well supported was reporting an accurate `Span` for the `RegExp("string-literal-with-\\escape")` case.

For example, these two cases produce the same `RegExp` instances in JavaScript:

- `/\d+/`
- `new RegExp("\\d+")`

For now, mainly in `oxc_linter`, the latter case is parsed with `oxc_parser` -> `ast::literal::StringLiteral` AST node -> `value` property.

At this point, escape sequences are already resolved(!), so `oxc_regular_expression` receives the same `&str` as an argument and can handle it without any problem in both cases.

However, in terms of `Span` representation, these cases should be handled differently because of the `\\` in string literals...

As a result, the parsed AST's `Span` for `new RegExp("string-literal")` is not accurate if it contains escape sequences.

e.g. https://github.com/oxc-project/oxc/blob/a01a5dfdafb9cd536cb87867697e3ae43b1990e6/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap#L118-L122

Each time the `\` appears, the subsequent position is shifted. `_` should be placed under `*` in this case.

So... to resolve this issue, we need to implement `string_literal_parser` first, and use its output as the reading units of `oxc_regular_expression`.
  • Loading branch information
leaysgur committed Oct 21, 2024
1 parent 82bc745 commit f8e1907
Show file tree
Hide file tree
Showing 25 changed files with 4,172 additions and 2,879 deletions.
14 changes: 8 additions & 6 deletions crates/oxc_regular_expression/examples/parse_literal.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#![allow(clippy::print_stdout)]

use oxc_allocator::Allocator;
use oxc_regular_expression::{Parser, ParserOptions};
use oxc_regular_expression::{LiteralParser, Options};

fn main() {
let allocator = Allocator::default();

for (pattern, flags) in [
for (pattern_text, flags_text) in [
(r"ab", ""),
(r"abc", "i"),
(r"abcd", "igv"),
Expand Down Expand Up @@ -43,14 +43,16 @@ fn main() {
(r"[\bb]", ""),
(r"a{2,1}", "v"), // Error
] {
let parser = Parser::new(
let parser = LiteralParser::new(
&allocator,
pattern,
ParserOptions::default().with_span_offset(1).with_flags(flags),
pattern_text,
Some(flags_text),
// +1 for added `/` in error reports
Options { pattern_span_offset: 1, ..Options::default() },
);
let ret = parser.parse();

let literal = format!("/{pattern}/{flags}");
let literal = format!("/{pattern_text}/{flags_text}");
println!("Parse: {literal}");
match ret {
Ok(pattern) => {
Expand Down
4 changes: 2 additions & 2 deletions crates/oxc_regular_expression/examples/regex_visitor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use oxc_allocator::Allocator;
use oxc_regular_expression::{
visit::{RegExpAstKind, Visit},
Parser, ParserOptions,
LiteralParser, Options,
};
use oxc_span::GetSpan;

Expand All @@ -23,7 +23,7 @@ fn main() {
let source_text = r"(https?:\/\/github\.com\/(([^\s]+)\/([^\s]+))\/([^\s]+\/)?(issues|pull)\/([0-9]+))|(([^\s]+)\/([^\s]+))?#([1-9][0-9]*)($|[\s\:\;\-\(\=])";

let allocator = Allocator::default();
let parser = Parser::new(&allocator, source_text, ParserOptions::default());
let parser = LiteralParser::new(&allocator, source_text, None, Options::default());
let pattern = parser.parse().unwrap();

let mut visitor = TestVisitor;
Expand Down
12 changes: 4 additions & 8 deletions crates/oxc_regular_expression/src/ast_impl/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ where

#[cfg(test)]
mod test {
use crate::{Parser, ParserOptions};
use crate::{LiteralParser, Options};
use oxc_allocator::Allocator;

type Case<'a> = (
Expand Down Expand Up @@ -557,13 +557,9 @@ mod test {
let pattern = &input[left_slash + 1..right_slash];
let flags = &input[right_slash + 1..];

let actual = Parser::new(
allocator,
pattern,
ParserOptions::default().with_span_offset(1).with_flags(flags),
)
.parse()
.unwrap();
let actual = LiteralParser::new(allocator, pattern, Some(flags), Options::default())
.parse()
.unwrap();

let expect = output.unwrap_or(input);
assert_eq!(expect, format!("/{actual}/{flags}")); // This uses `Display` impls
Expand Down
25 changes: 25 additions & 0 deletions crates/oxc_regular_expression/src/diagnostics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,31 @@ use oxc_span::Span;

const PREFIX: &str = "Invalid regular expression:";

#[cold]
pub fn invalid_input(span: Span) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Invalid input string literal")).with_label(span)
}

// ---

#[cold]
pub fn unknown_flag(span: Span, flag: &str) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Unknown flag: `{flag}` found")).with_label(span)
}

#[cold]
pub fn duplicated_flags(span: Span, flag: &str) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Duplicated flag: `{flag}` found")).with_label(span)
}

#[cold]
pub fn invalid_unicode_flags(span: Span) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Invalid unicode flags combination `u` and `v`"))
.with_label(span)
}

// ---

#[cold]
pub fn duplicated_capturing_group_names(spans: Vec<Span>) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Duplicated capturing group names")).with_labels(spans)
Expand Down
90 changes: 89 additions & 1 deletion crates/oxc_regular_expression/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,92 @@ mod generated {
}

pub mod ast;
pub use crate::{ast_impl::visit, options::ParserOptions, parser::Parser};
pub use crate::{
ast_impl::visit,
options::Options,
parser::{ConstructorParser, LiteralParser},
};

// LEGACY APIS TO BE REMOVED SOON! ============================================

/// Options for the legacy `Parser` API.
#[derive(Clone, Copy, Debug, Default)]
pub struct ParserOptions {
    /// Offset used to adjust pattern `Span` positions to fit the global source code.
    pub span_offset: u32,
    /// Whether unicode mode is enabled (`u` or `v` flag).
    pub unicode_mode: bool,
    /// Whether unicode sets mode is enabled (`v` flag).
    pub unicode_sets_mode: bool,
    /// When `true`, the legacy `Parser` parses the source with `ConstructorParser`
    /// instead of `LiteralParser`.
    pub parse_string_literal: bool,
}

impl ParserOptions {
    /// Returns a copy of these options with `span_offset` replaced.
    #[must_use]
    pub fn with_span_offset(mut self, span_offset: u32) -> Self {
        self.span_offset = span_offset;
        self
    }

    /// Returns a copy of these options with both unicode modes recomputed from `flags`.
    ///
    /// `v` enables both modes, `u` enables only `unicode_mode`; every other character is
    /// ignored. Previously stored mode values are overwritten, not merged.
    #[must_use]
    pub fn with_flags(mut self, flags: &str) -> Self {
        let has_unicode_sets_flag = flags.contains('v');
        self.unicode_sets_mode = has_unicode_sets_flag;
        self.unicode_mode = has_unicode_sets_flag || flags.contains('u');
        self
    }

    /// Returns a copy of these options with `parse_string_literal` switched on.
    #[must_use]
    pub fn with_parse_string_literal(mut self) -> Self {
        self.parse_string_literal = true;
        self
    }
}

/// Legacy parser entry point that forwards to `ConstructorParser` or `LiteralParser`.
pub struct Parser<'a> {
    /// Allocator passed on to the underlying parser.
    allocator: &'a oxc_allocator::Allocator,
    /// Text to parse; passed unchanged to the underlying parser.
    source_text: &'a str,
    /// Legacy options that are translated into `Options` and a flags text in `parse`.
    options: ParserOptions,
}

impl<'a> Parser<'a> {
    /// Stores the arguments; nothing is parsed until [`Parser::parse`] is called.
    pub fn new(
        allocator: &'a oxc_allocator::Allocator,
        source_text: &'a str,
        options: ParserOptions,
    ) -> Self {
        Self { allocator, source_text, options }
    }

    /// Translates the legacy options and delegates to the new parsers.
    ///
    /// `ConstructorParser` is used when `parse_string_literal` is set, `LiteralParser`
    /// otherwise. The unicode modes are passed on as a synthesized flags text.
    pub fn parse(self) -> oxc_diagnostics::Result<crate::ast::Pattern<'a>> {
        let Self { allocator, source_text, options: legacy } = self;

        let options = Options {
            pattern_span_offset: legacy.span_offset,
            flags_span_offset: 0, // Never used
        };

        // `ConstructorParser` is handed the flag wrapped in quotes, `LiteralParser` the bare flag.
        let (u_text, v_text) =
            if legacy.parse_string_literal { ("'u'", "'v'") } else { ("u", "v") };

        // `v` takes precedence over `u`; the combination is not validated here.
        let flags_text = if legacy.unicode_sets_mode {
            Some(v_text)
        } else if legacy.unicode_mode {
            Some(u_text)
        } else {
            None
        };

        if legacy.parse_string_literal {
            ConstructorParser::new(allocator, source_text, flags_text, options).parse()
        } else {
            LiteralParser::new(allocator, source_text, flags_text, options).parse()
        }
    }
}
36 changes: 5 additions & 31 deletions crates/oxc_regular_expression/src/options.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,7 @@
#[derive(Clone, Copy, Debug, Default)]
pub struct ParserOptions {
/// Used to adjust Span positions to fit the global source code.
pub span_offset: u32,
/// Unicode mode(`u` or `v` flag) enabled or not.
pub unicode_mode: bool,
/// Extended Unicode mode(`v` flag) enabled or not.
pub unicode_sets_mode: bool,
// TODO: Add `handle_escape_with_quote_type` like option to support `new RegExp("with \"escape\"")`
}

impl ParserOptions {
#[must_use]
pub fn with_span_offset(self, span_offset: u32) -> Self {
ParserOptions { span_offset, ..self }
}

#[must_use]
pub fn with_flags(self, flags: &str) -> Self {
let (mut unicode_mode, mut unicode_sets_mode) = (false, false);
for ch in flags.chars() {
if ch == 'u' {
unicode_mode = true;
}
if ch == 'v' {
unicode_mode = true;
unicode_sets_mode = true;
}
}

ParserOptions { unicode_mode, unicode_sets_mode, ..self }
}
pub struct Options {
/// Used to adjust `Span` positions to fit the global source code.
pub pattern_span_offset: u32,
/// Used to adjust `Span` positions to fit the global source code.
pub flags_span_offset: u32,
}
119 changes: 119 additions & 0 deletions crates/oxc_regular_expression/src/parser/flags_parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use oxc_diagnostics::Result;
use rustc_hash::FxHashSet;

use crate::{
diagnostics,
parser::{reader::Reader, span_factory::SpanFactory},
};

/// Validates regular expression flags and extracts the unicode-related modes.
pub struct FlagsParser<'a> {
    /// Source of the flag code points, consumed one at a time by `parse`.
    reader: Reader<'a>,
    /// Creates the `Span`s attached to diagnostics; built from the span offset given to `new`.
    span_factory: SpanFactory,
}

impl<'a> FlagsParser<'a> {
    /// Creates a parser reading flags from `reader`.
    ///
    /// `span_offset` is passed to `SpanFactory::new` and so applies to every reported span.
    pub fn new(reader: Reader<'a>, span_offset: u32) -> Self {
        Self { reader, span_factory: SpanFactory::new(span_offset) }
    }

    /// Consumes every flag from the reader and validates it.
    ///
    /// Returns: (is_unicode_mode, is_unicode_sets_mode)
    ///
    /// # Errors
    ///
    /// Fails on the first flag that is duplicated, is not one of `d g i m s u v y`,
    /// or combines `u` with `v`. The duplicate check runs before the other two.
    pub fn parse(mut self) -> Result<(bool, bool)> {
        let mut is_unicode_mode = false;
        let mut is_unicode_sets_mode = false;
        let mut seen = FxHashSet::default();

        while let Some(cp) = self.reader.peek() {
            let start = self.reader.offset();
            self.reader.advance();
            let end = self.reader.offset();

            // `insert` returns `false` when the flag was already recorded.
            if !seen.insert(cp) {
                return Err(diagnostics::duplicated_flags(
                    self.span_factory.create(start, end),
                    &self.reader.atom(start, end),
                ));
            }

            match char::try_from(cp) {
                Ok('u') => {
                    if seen.contains(&('v' as u32)) {
                        return Err(diagnostics::invalid_unicode_flags(
                            self.span_factory.create(start, end),
                        ));
                    }
                    is_unicode_mode = true;
                }
                Ok('v') => {
                    if seen.contains(&('u' as u32)) {
                        return Err(diagnostics::invalid_unicode_flags(
                            self.span_factory.create(start, end),
                        ));
                    }
                    is_unicode_mode = true;
                    is_unicode_sets_mode = true;
                }
                Ok('d' | 'g' | 'i' | 'm' | 's' | 'y') => {}
                // Anything else, including values that are not a valid `char`.
                _ => {
                    return Err(diagnostics::unknown_flag(
                        self.span_factory.create(start, end),
                        &self.reader.atom(start, end),
                    ));
                }
            }
        }

        Ok((is_unicode_mode, is_unicode_sets_mode))
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Builds a reader over `flags_text` and runs the flags parser with a zero span offset.
    ///
    /// `quoted` is forwarded as the third argument of `Reader::initialize`; the cases below
    /// set it only for inputs that are wrapped in quotes.
    fn parse_flags(flags_text: &str, quoted: bool) -> Result<(bool, bool)> {
        let reader = Reader::initialize(flags_text, true, quoted).unwrap();
        FlagsParser::new(reader, 0).parse()
    }

    #[test]
    fn should_pass() {
        let cases = [
            ("", (false, false)),
            ("i", (false, false)),
            ("u", (true, false)),
            ("v", (true, true)),
            ("vg", (true, true)),
        ];
        for (flags_text, expected) in cases {
            assert_eq!(parse_flags(flags_text, false).unwrap(), expected);
        }
    }

    #[test]
    fn should_fail() {
        for flags_text in ["uv", "vu", "uu", "vv", "gg", "$"] {
            assert!(parse_flags(flags_text, false).is_err());
        }
        for flags_text in [r#""uv""#, "'V'", "\"-\"", r#""\162""#] {
            assert!(parse_flags(flags_text, true).is_err());
        }
    }

    #[test]
    fn string_literal() {
        // Every spelling below resolves to the single flag `u`.
        let cases = [
            ("u", false),
            ("'u'", true),
            (r#""\165""#, true),
            (r#""\x75""#, true),
            (r#""\u0075""#, true),
            (r#""\u{0075}""#, true),
        ];
        for (flags_text, quoted) in cases {
            assert_eq!(parse_flags(flags_text, quoted).unwrap(), (true, false));
        }
    }
}
Loading

0 comments on commit f8e1907

Please sign in to comment.