This repository has been archived by the owner on Aug 31, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 656
feat(rome_js_analyze): noControlCharactersInRegex #4656
Merged
Changes from 2 commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
274 changes: 274 additions & 0 deletions
274
crates/rome_js_analyze/src/analyzers/nursery/no_control_characters_in_regex.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
use crate::utils::escape_string; | ||
use rome_analyze::{context::RuleContext, declare_rule, Ast, Rule, RuleDiagnostic}; | ||
use rome_console::markup; | ||
use rome_js_syntax::{ | ||
AnyJsExpression, JsCallArguments, JsCallExpression, JsNewExpression, JsRegexLiteralExpression, | ||
JsStringLiteralExpression, | ||
}; | ||
use rome_rowan::{declare_node_union, AstNode, AstSeparatedList}; | ||
use std::{iter::Peekable, str::Chars}; | ||
|
||
declare_rule! { | ||
/// Prevents from having control characters and some escape sequences that match control characters in regular expressions. | ||
/// | ||
/// Control characters are hidden special characters that are numbered from 0 to 31 in the ASCII system. | ||
/// They're not commonly used in JavaScript text. So, if you see them in a pattern (called a regular expression), it's probably a mistake. | ||
/// | ||
/// The following elements of regular expression patterns are considered possible errors in typing and are therefore disallowed by this rule: | ||
/// | ||
/// - Hexadecimal character escapes from `\x00` to `\x1F` | ||
/// - Unicode character escapes from `\u0000` to `\u001F` | ||
/// - Unicode code point escapes from `\u{0}` to `\u{1F}` | ||
/// - Unescaped raw characters from U+0000 to U+001F | ||
/// | ||
/// Control escapes such as `\t` and `\n` are allowed by this rule. | ||
/// | ||
/// Source: https://eslint.org/docs/latest/rules/no-control-regex | ||
/// | ||
/// ## Examples | ||
/// | ||
/// ### Invalid | ||
/// ```js,expect_diagnostic | ||
/// var pattern1 = /\x00/; | ||
/// ``` | ||
/// ```js,expect_diagnostic | ||
/// var pattern2 = /\x0C/; | ||
/// ``` | ||
/// ```js,expect_diagnostic | ||
/// var pattern3 = /\x1F/; | ||
/// ``` | ||
/// ```js,expect_diagnostic | ||
/// var pattern4 = /\u000C/; | ||
/// ``` | ||
/// ```js,expect_diagnostic | ||
/// var pattern5 = /\u{C}/u; | ||
/// ``` | ||
/// ```js,expect_diagnostic | ||
/// var pattern7 = new RegExp("\x0C"); | ||
/// ``` | ||
/// ```js,expect_diagnostic | ||
/// var pattern7 = new RegExp("\\x0C"); | ||
/// ``` | ||
/// | ||
/// ### Valid | ||
/// ```js | ||
/// var pattern1 = /\x20/; | ||
/// var pattern2 = /\u0020/; | ||
/// var pattern3 = /\u{20}/u; | ||
/// var pattern4 = /\t/; | ||
/// var pattern5 = /\n/; | ||
/// var pattern6 = new RegExp("\x20"); | ||
/// ``` | ||
/// | ||
pub(crate) NoControlCharactersInRegex { | ||
version: "next", | ||
name: "noControlCharactersInRegex", | ||
recommended: true, | ||
} | ||
} | ||
|
||
declare_node_union! { | ||
pub(crate) RegexExpressionLike = JsNewExpression | JsCallExpression | JsRegexLiteralExpression | ||
} | ||
|
||
/// Decodes the two hexadecimal digits that follow a `\x` escape.
///
/// On success the two digits are consumed from `iter` and returned together with the
/// code point they encode.
///
/// Returns `None`, leaving `iter` untouched, when the next two characters are not both
/// ASCII hexadecimal digits, so the caller can keep scanning from the same position.
fn decode_hex_character_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
    // Work on a copy so that nothing is consumed unless the whole escape is valid.
    let mut lookahead = iter.clone();
    let first = lookahead.next()?;
    let second = lookahead.next()?;
    // `i64::from_str_radix` also accepts a leading `+`/`-`, so without this check
    // `\x+1` would be decoded as code point 1.
    if !(first.is_ascii_hexdigit() && second.is_ascii_hexdigit()) {
        return None;
    }
    let digits = format!("{first}{second}");
    let code_point = i64::from_str_radix(&digits, 16).ok()?;
    *iter = lookahead;
    Some((digits, code_point))
}
|
||
/// Decodes the four hexadecimal digits that follow a `\u` escape (`\uXXXX`).
///
/// On success the four digits are consumed from `iter` and returned together with the
/// code point they encode.
///
/// Returns `None`, leaving `iter` untouched, when fewer than four ASCII hexadecimal
/// digits follow: `\u1F` or `\u{1F}` are not `\uXXXX` escapes.
fn decode_unicode_escape_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
    // Work on a copy so that nothing is consumed unless the whole escape is valid.
    let mut lookahead = iter.clone();
    let mut digits = String::with_capacity(4);
    // A `\uXXXX` escape has exactly 4 hexadecimal digits, hence the fixed count.
    for _ in 0..4 {
        let digit = lookahead.next().filter(char::is_ascii_hexdigit)?;
        digits.push(digit);
    }
    let code_point = i64::from_str_radix(&digits, 16).ok()?;
    *iter = lookahead;
    Some((digits, code_point))
}
|
||
/// Decodes the `{X…}` body that follows `\u` in a Unicode code point escape (`\u{X…}`).
///
/// On success the braces and digits are consumed from `iter`; the returned string is
/// the body including its braces (for example `{1F}`), paired with the decoded code point.
///
/// Returns `None`, leaving `iter` untouched, when the input does not start with `{`,
/// the closing `}` is missing, the body contains anything other than ASCII hexadecimal
/// digits, or the digits cannot be parsed into an `i64` (empty body or overflow).
fn decode_escaped_code_point_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
    // Work on a copy so that a malformed escape does not swallow the rest of the pattern.
    let mut lookahead = iter.clone();
    if lookahead.next()? != '{' {
        return None;
    }
    let mut digits = String::new();
    loop {
        match lookahead.next()? {
            '}' => break,
            // Only hexadecimal digits are allowed; this also rejects the leading sign
            // that `i64::from_str_radix` would otherwise accept.
            c if c.is_ascii_hexdigit() => digits.push(c),
            _ => return None,
        }
    }
    let code_point = i64::from_str_radix(&digits, 16).ok()?;
    *iter = lookahead;
    Some((format!("{{{digits}}}"), code_point))
}
|
||
/// Runs `decode` on `iter` and records the escape sequence when it denotes an ASCII
/// control character.
///
/// The recorded string is `prefix` followed by the text returned by `decode`.
/// Nothing is recorded when `decode` returns `None` or the decoded code point lies
/// outside `0..=31`.
fn add_control_character_to_vec(
    prefix: &str,
    iter: &mut Peekable<Chars>,
    control_characters: &mut Vec<String>,
    decode: fn(&mut Peekable<Chars>) -> Option<(String, i64)>,
) {
    // ASCII control characters occupy code points 0 through 31.
    if let Some((escape_body, 0..=31)) = decode(iter) {
        control_characters.push(format!("{prefix}{escape_body}"));
    }
}
|
||
/// Collecting control characters for regex. The following characters in regular expression patterns are considered as control characters: | ||
/// - Hexadecimal character escapes from `\x00` to `\x1F`. | ||
/// - Unicode character escapes from `\u0000` to `\u001F`. | ||
/// - Unicode code point escapes range from `\u{0}` to `\u{1F}`. | ||
/// - The Unicode flag must be set as true in order for these Unicode code point escapes to work: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode. | ||
/// - Unescaped raw characters from U+0000 to U+001F. | ||
fn collect_control_characters(pattern: String, flags: Option<String>) -> Option<Vec<String>> { | ||
let mut control_characters: Vec<String> = Vec::new(); | ||
let is_unicode_flag_set = flags.unwrap_or_default().contains('u'); | ||
let mut iter = pattern.chars().peekable(); | ||
|
||
while let Some(c) = iter.next() { | ||
match c { | ||
'\\' => match iter.next() { | ||
Some('x') => add_control_character_to_vec( | ||
"\\x", | ||
&mut iter, | ||
&mut control_characters, | ||
decode_hex_character_to_code_point, | ||
), | ||
Some('u') if is_unicode_flag_set => add_control_character_to_vec( | ||
"\\u", | ||
&mut iter, | ||
&mut control_characters, | ||
decode_escaped_code_point_to_code_point, | ||
), | ||
Some('u') => add_control_character_to_vec( | ||
"\\u", | ||
&mut iter, | ||
&mut control_characters, | ||
decode_unicode_escape_to_code_point, | ||
), | ||
Some('\\') => continue, | ||
_ => break, | ||
}, | ||
_ => continue, | ||
} | ||
} | ||
|
||
if !control_characters.is_empty() { | ||
Some(control_characters) | ||
} else { | ||
None | ||
} | ||
} | ||
|
||
fn collect_control_characters_from_expression( | ||
callee: &AnyJsExpression, | ||
js_call_arguments: &JsCallArguments, | ||
) -> Option<Vec<String>> { | ||
let js_identifier = match callee { | ||
AnyJsExpression::JsIdentifierExpression(js_identifier) => js_identifier, | ||
_ => return None, | ||
}; | ||
|
||
if js_identifier.name().ok()?.has_name("RegExp") { | ||
let mut args = js_call_arguments.args().iter(); | ||
let raw_pattern = args | ||
.next() | ||
.and_then(|arg| arg.ok()) | ||
.and_then(|arg| JsStringLiteralExpression::cast_ref(arg.syntax())) | ||
.and_then(|js_string_literal| js_string_literal.inner_string_text().ok())? | ||
.to_string(); | ||
|
||
let pattern = escape_string(&raw_pattern).unwrap_or(raw_pattern); | ||
|
||
let regexp_flags = args | ||
.next() | ||
.and_then(|arg| arg.ok()) | ||
.and_then(|arg| JsStringLiteralExpression::cast_ref(arg.syntax())) | ||
.map(|js_string_literal| js_string_literal.text()); | ||
|
||
return collect_control_characters(pattern, regexp_flags); | ||
} | ||
None | ||
} | ||
|
||
impl Rule for NoControlCharactersInRegex { | ||
type Query = Ast<RegexExpressionLike>; | ||
type State = Vec<String>; | ||
type Signals = Option<Self::State>; | ||
type Options = (); | ||
|
||
fn run(ctx: &RuleContext<Self>) -> Self::Signals { | ||
let node = ctx.query(); | ||
match node { | ||
RegexExpressionLike::JsNewExpression(js_new_expression) => { | ||
collect_control_characters_from_expression( | ||
&js_new_expression.callee().ok()?, | ||
&js_new_expression.arguments()?, | ||
) | ||
} | ||
RegexExpressionLike::JsCallExpression(js_call_expression) => { | ||
collect_control_characters_from_expression( | ||
&js_call_expression.callee().ok()?, | ||
&js_call_expression.arguments().ok()?, | ||
) | ||
} | ||
RegexExpressionLike::JsRegexLiteralExpression(js_regex_literal_expression) => { | ||
collect_control_characters( | ||
js_regex_literal_expression.pattern().ok()?, | ||
js_regex_literal_expression.flags().ok(), | ||
) | ||
} | ||
} | ||
} | ||
|
||
fn diagnostic(ctx: &RuleContext<Self>, state: &Self::State) -> Option<RuleDiagnostic> { | ||
Some(RuleDiagnostic::new( | ||
rule_category!(), | ||
ctx.query().range(), | ||
markup! { | ||
"Unexpected control character(s) in regular expression: "<Emphasis>{state.join(", ")}</Emphasis>"" | ||
}, | ||
Comment on lines
+239
to
+241
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should let the user know what they should do here. For example, I see the diagnostic for the first time, and I don't know how to solve the issue 😄 |
||
).note( | ||
markup! { | ||
"Control characters are unusual and potentially incorrect inputs, so they are disallowed." | ||
} | ||
)) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the owned list of escape sequences the collector is expected to return.
    fn expected(escapes: &[&str]) -> Option<Vec<String>> {
        Some(escapes.iter().map(|escape| escape.to_string()).collect())
    }

    #[test]
    fn test_collect_control_characters() {
        // `\xHH` and `\uHHHH` escapes in the control range, without flags.
        let without_flags =
            collect_control_characters("\\x00\\x0F\\u0010\\u001F".to_string(), None);
        assert_eq!(
            without_flags,
            expected(&["\\x00", "\\x0F", "\\u0010", "\\u001F"])
        );

        // `\u{H}` escapes in the control range, with the Unicode flag.
        let with_unicode_flag =
            collect_control_characters("\\u{0}\\u{1F}".to_string(), Some("u".to_string()));
        assert_eq!(with_unicode_flag, expected(&["\\u{0}", "\\u{1F}"]));

        // Escapes outside the control range and control escapes yield nothing.
        let nothing_to_report =
            collect_control_characters("\\x20\\u0020\\u{20}\\t\\n".to_string(), None);
        assert_eq!(nothing_to_report, None);
    }
}
25 changes: 25 additions & 0 deletions
25
crates/rome_js_analyze/tests/specs/nursery/noControlCharactersInRegex/invalid.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Every regular expression below contains at least one escape sequence that denotes
// a control character and is expected to produce a diagnostic.
var regex = RegExp("\\x1f");
var regex = RegExp("\\u{1111}*\\x1F", "u");
var regex = new RegExp("\\x1f\\x1e");
var regex = new RegExp("\\x1fFOO\\x00");
var regex = new RegExp("FOO\\x1fFOO\\x1f");
var regex = new RegExp("\\x1f");
var regex = new RegExp("\\u001F", flags);
var regex = new RegExp("\\u{1111}*\\x1F", "u");
var regex = new RegExp("\\u{1F}", "u");
var regex = new RegExp("\\u{1F}", "gui");
var regex = new RegExp("\\x0C");
var regex = new RegExp("\x0C");
var regex = /\x00/;
var regex = /\x0C/;
var regex = /\x1F/;
var regex = /\u000C/;
var regex = /\u{C}/u;
var regex = /\\\x1f\\x1e/;
var regex = /\\\x1fFOO\\x00/;
var regex = /FOO\\\x1fFOO\\x1f/;
// In a regex literal `\\x1f` is an escaped backslash followed by the text "x1f",
// so the control escape inside the named group is written with a single backslash.
var regex = /(?<a>\x1f)/;
var regex = /(?<\u{1d49c}>.)\x1f/;
var regex = /\u{1111}*\x1F/u;
var regex = /\u{1F}/u;
var regex = /\u{1F}/gui;
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you leave a comment explaining why `0..4` is used? As a first-time reader, it doesn't make sense to me.