Skip to content

Commit

Permalink
[eradicate] Better detection of IntelliJ language injection comments (`ERA001`) (#14094)
Browse files: browse the repository at this point in the history
  • Loading branch information
InSyncWithFoo authored Nov 6, 2024
1 parent 31681f6 commit 46c5a13
Showing 1 changed file with 66 additions and 6 deletions.
72 changes: 66 additions & 6 deletions crates/ruff_linter/src/rules/eradicate/detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,43 @@ static CODE_INDICATORS: LazyLock<AhoCorasick> = LazyLock::new(|| {

/// Lazily compiled pattern, anchored at the start of the input (`^`), that
/// recognises well-known tool directives: `pyright`, `mypy:`, `type: ignore`,
/// `SPDX-License-Identifier:`, `fmt:` and `isort:` switches, `region` /
/// `endregion`, `noqa`, `pylint`, `nosec`, `coding` / `encoding` declarations
/// and IntelliJ `language=<id>` injection comments.
///
/// The verbose form uses `(?x)`, so whitespace and `#` comments inside the
/// pattern are ignored; a literal space is therefore spelled `\x20`.
/// Case-insensitivity is scoped with `(?i: ... )` to the alternatives that
/// need it, while the leading alternatives and `language=` stay
/// case-sensitive.
///
/// Presumably a comment that matches is exempt from being reported as
/// commented-out code -- TODO confirm against the caller, which is not
/// visible in this excerpt.
//
// NOTE(review): this excerpt looks like a rendered diff. Both the previous
// single-line pattern and its verbose replacement appear below, each after its
// own `Regex::new(` / `r"` opener; confirm which one the real file keeps.
static ALLOWLIST_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"^(?i)(?:pylint|pyright|noqa|nosec|region|endregion|type:\s*ignore|fmt:\s*(on|off)|isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)|mypy:|SPDX-License-Identifier:|language=[a-zA-Z](?: ?[-_.a-zA-Z0-9]+)+(?:\s+prefix=\S+)?(?:\s+suffix=\S+)?|(?:en)?coding[:=][ \t]*([-_.a-zA-Z0-9]+))",
).unwrap()
r"(?x)
^
(?:
# Case-sensitive
pyright
| mypy:
| type:\s*ignore
| SPDX-License-Identifier:
| fmt:\s*(on|off|skip)
| region|endregion
# Case-insensitive
| (?i:
noqa
)
# Unknown case sensitivity
| (?i:
pylint
| nosec
| isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)
| (?:en)?coding[:=][\x20\t]*([-_.A-Z0-9]+)
)
# IntelliJ language injection comments:
# * `language` must be lowercase.
# * No spaces around `=`.
# * Language IDs as used in comments must have no spaces,
# though to IntelliJ they can be anything.
# * May optionally contain `prefix=` and/or `suffix=`,
# not declared here since we use `.is_match()`.
| language=[-_.a-zA-Z0-9]+
)
",
)
.unwrap()
});

/// Lazily compiled pattern matching a `#` directly followed by a digit (`\d`).
static HASH_NUMBER: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"#\d";
    Regex::new(pattern).unwrap()
});
Expand Down Expand Up @@ -299,17 +334,42 @@ mod tests {

// Exercises `comment_contains_code` on IntelliJ-style `language=` comments.
//
// Asserted to be reported as code: `language` written with wrong casing,
// quoted language IDs, and spaces around `=`.
// Asserted NOT to be reported as code: well-formed `language=<id>` comments,
// including ones with leading whitespace, human-language IDs, mixed-case IDs,
// an ID followed by a space, and trailing `prefix=` / `suffix=` parts.
//
// NOTE(review): this excerpt looks like a rendered diff, so some assertions
// seem to be pre-change lines shown next to their replacements (for example,
// two string literals back to back inside one `comment_contains_code` call).
// Confirm against the actual file before relying on any single assertion.
#[test]
fn comment_contains_language_injection() {
assert!(comment_contains_code("# language=123", &[]));
// `language` with bad casing
assert!(comment_contains_code("# Language=C#", &[]));
assert!(comment_contains_code("# lAngUAgE=inI", &[]));

// Unreasonable language IDs, possibly literals
assert!(comment_contains_code("# language=\"pt\"", &[]));
assert!(comment_contains_code("# language='en'", &[]));

assert!(!comment_contains_code("# language=xml", &[]));
// Spaces around equal sign
assert!(comment_contains_code("# language =xml", &[]));
assert!(comment_contains_code("# language= html", &[]));
assert!(comment_contains_code("# language = RegExp", &[]));

// Leading whitespace
assert!(!comment_contains_code("#language=CSS", &[]));
assert!(!comment_contains_code("# \t language=C++", &[]));

// Human language false negatives
assert!(!comment_contains_code("# language=en", &[]));
assert!(!comment_contains_code("# language=en-US", &[]));

// Casing (fine because such IDs cannot be validated)
assert!(!comment_contains_code("# language=PytHoN", &[]));
assert!(!comment_contains_code("# language=jaVaScrIpt", &[]));

// Space within ID (fine because `Shell` is considered the ID)
assert!(!comment_contains_code("# language=Shell Script", &[]));

// With prefix and/or suffix
assert!(!comment_contains_code("# language=HTML prefix=<body>", &[]));
assert!(!comment_contains_code(
"# language=HTML prefix=<body> suffix=</body>",
r"# language=Requirements suffix=\n",
&[]
));
assert!(!comment_contains_code(
"# language=ecma script level 4",
"language=javascript prefix=(function(){ suffix=})()",
&[]
));
}
Expand Down

0 comments on commit 46c5a13

Please sign in to comment.