Skip to content

Commit

Permalink
Rebase
Browse files Browse the repository at this point in the history
  • Loading branch information
charliermarsh committed Nov 3, 2023
1 parent 25bccf0 commit 292bf7f
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,11 @@ def f():
# And here's a comment with a greek alpha: ∗
foo # And here's a comment with an unusual punctuation mark: ᜵
}"

# At runtime the attribute will be stored as Greek small letter mu instead of
# micro sign because of PEP 3131's NFKC normalization
class Labware:
µL = 1.5


assert getattr(Labware(), "µL") == 1.5
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ pub(crate) fn ambiguous_unicode_character(
let candidate = Candidate::new(
TextSize::try_from(relative_offset).unwrap() + range.start(),
current_char,
representant as char,
char::from_u32(representant).unwrap(),
);
if let Some(diagnostic) = candidate.into_diagnostic(context, settings) {
diagnostics.push(diagnostic);
Expand All @@ -178,7 +178,7 @@ pub(crate) fn ambiguous_unicode_character(
word_candidates.push(Candidate::new(
TextSize::try_from(relative_offset).unwrap() + range.start(),
current_char,
representant as char,
char::from_u32(representant).unwrap(),
));
} else {
// The current word contains at least one unambiguous unicode character.
Expand Down
5 changes: 4 additions & 1 deletion crates/ruff_linter/src/rules/ruff/rules/confusables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
/// Via: <https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json>
/// See: <https://github.com/microsoft/vscode/blob/095ddabc52b82498ee7f718a34f9dd11d59099a8/src/vs/base/common/strings.ts#L1094>
pub(crate) fn confusable(c: u32) -> Option<u8> {
pub(crate) fn confusable(c: u32) -> Option<u32> {
let result = match c {
160u32 => 32,
180u32 => 96,
Expand Down Expand Up @@ -1586,6 +1586,9 @@ pub(crate) fn confusable(c: u32) -> Option<u8> {
130_039_u32 => 55,
130_040_u32 => 56,
130_041_u32 => 57,
0x212B => 0x00C5,
0x2126 => 0x03A9,
0x00B5 => 0x03BC,
_ => return None,
};
Some(result)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,10 @@ confusables.py:46:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE
47 | }"
|

confusables.py:55:28: RUF001 String contains ambiguous `µ` (MICRO SIGN). Did you mean `μ` (GREEK SMALL LETTER MU)?
|
55 | assert getattr(Labware(), "µL") == 1.5
| ^ RUF001
|


32 changes: 30 additions & 2 deletions scripts/update_ambiguous_characters.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,44 @@ def get_mapping_data() -> dict:
return json.loads(json.loads(content))


def format_number(number: int) -> str:
    """Render ``number`` as a Rust ``u32`` literal for the generated match arms.

    Values greater than 100,000 are written with underscore digit separators
    and a ``_u32`` suffix (e.g. ``130_039_u32``); all other values are written
    as plain digits with a ``u32`` suffix (e.g. ``160u32``).
    """
    # For unknown historical reasons, numbers greater than 100,000 were
    # underscore-delimited in the generated file, so we now preserve that property to
    # avoid unnecessary churn.
    if number > 100000:
        # The `_` format spec groups digits in threes counting from the right, so
        # numbers whose length is not a multiple of three are still grouped as
        # thousands (1114111 -> 1_114_111 rather than 111_411_1).
        return f"{number:_}_u32"
    return f"{number}u32"


def format_confusables_rs(raw_data: dict[str, list[int]]) -> str:
"""Format the downloaded data into a Rust source file."""
# The input data contains duplicate entries
# The input data contains duplicate entries.
flattened_items: set[tuple[int, int]] = set()
for _category, items in raw_data.items():
assert len(items) % 2 == 0, "Expected pairs of items"
for i in range(0, len(items), 2):
flattened_items.add((items[i], items[i + 1]))

tuples = [f" {left}u32 => {right},\n" for left, right in sorted(flattened_items)]
tuples = [
f" {format_number(left)} => {right},\n"
for left, right in sorted(flattened_items)
]

# Add some additional confusable pairs that are not included in the VS Code data,
# as they're unicode-to-unicode confusables, not unicode-to-ASCII confusables.
confusable_units = [
# ANGSTROM SIGN → LATIN CAPITAL LETTER A WITH RING ABOVE
("0x212B", "0x00C5"),
# OHM SIGN → GREEK CAPITAL LETTER OMEGA
("0x2126", "0x03A9"),
# MICRO SIGN → GREEK SMALL LETTER MU
("0x00B5", "0x03BC"),
]
tuples += [f" {left} => {right},\n" for left, right in confusable_units]

print(f"{len(tuples)} confusable tuples.")

Expand Down

0 comments on commit 292bf7f

Please sign in to comment.