From eabff673b37c5430d4cf72fa050a189a57be2deb Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Sun, 22 Jan 2023 18:51:09 +0530 Subject: [PATCH] Format hex code in unicode escape sequences in string literals (#2916) Co-authored-by: Jelle Zijlstra --- CHANGES.md | 1 + src/black/linegen.py | 4 ++ src/black/mode.py | 1 + src/black/strings.py | 44 ++++++++++++++++++- .../data/preview/format_unicode_escape_seq.py | 33 ++++++++++++++ 5 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 tests/data/preview/format_unicode_escape_seq.py diff --git a/CHANGES.md b/CHANGES.md index 1450278341b..e2e4b341761 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -16,6 +16,7 @@ +- Format hex code in unicode escape sequences in string literals (#2916) - Add parentheses around `if`-`else` expressions (#2278) - Improve the performance on large expressions that contain many strings (#3467) - Fix a crash in preview style with assert + parenthesized string (#3415) diff --git a/src/black/linegen.py b/src/black/linegen.py index 2f50257a930..bfc28ca006c 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -59,6 +59,7 @@ get_string_prefix, normalize_string_prefix, normalize_string_quotes, + normalize_unicode_escape_sequences, ) from black.trans import ( CannotTransform, @@ -368,6 +369,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]: yield from self.visit_default(node) def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: + if Preview.hex_codes_in_unicode_sequences in self.mode: + normalize_unicode_escape_sequences(leaf) + if is_docstring(leaf) and "\\\n" not in leaf.value: # We're ignoring docstrings with backslash newline escapes because changing # indentation of those changes the AST representation of the code. 
diff --git a/src/black/mode.py b/src/black/mode.py index af0706e6a0b..4309d4fa635 100644 --- a/src/black/mode.py +++ b/src/black/mode.py @@ -153,6 +153,7 @@ def supports_feature(target_versions: Set[TargetVersion], feature: Feature) -> b class Preview(Enum): """Individual preview style features.""" + hex_codes_in_unicode_sequences = auto() annotation_parens = auto() empty_lines_before_class_or_def_with_leading_comments = auto() handle_trailing_commas_in_head = auto() diff --git a/src/black/strings.py b/src/black/strings.py index 9d0e2eb8430..3e3bc12fe72 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -5,7 +5,9 @@ import re import sys from functools import lru_cache -from typing import List, Pattern +from typing import List, Match, Pattern + +from blib2to3.pytree import Leaf if sys.version_info < (3, 8): from typing_extensions import Final @@ -18,6 +20,15 @@ r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL ) FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") +UNICODE_ESCAPE_RE: Final = re.compile( + r"(?P<backslashes>\\+)(?P<body>" + r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx + r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx + r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh + r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database + r")", + re.VERBOSE, +) def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str: @@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str: return s # Prefer double quotes return f"{prefix}{new_quote}{new_body}{new_quote}" + + +def normalize_unicode_escape_sequences(leaf: Leaf) -> None: + """Replace hex codes in Unicode escape sequences with lowercase representation.""" + text = leaf.value + prefix = get_string_prefix(text) + if "r" in prefix.lower(): + return + + def replace(m: Match[str]) -> str: + groups = m.groupdict() + back_slashes = groups["backslashes"] + + if len(back_slashes) % 2 == 0: + return back_slashes + 
groups["body"] + + if groups["u"]: + # \u + return back_slashes + "u" + groups["u"].lower() + elif groups["U"]: + # \U + return back_slashes + "U" + groups["U"].lower() + elif groups["x"]: + # \x + return back_slashes + "x" + groups["x"].lower() + else: + assert groups["N"], f"Unexpected match: {m}" + # \N{} + return back_slashes + "N{" + groups["N"].upper() + "}" + + leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text) diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py new file mode 100644 index 00000000000..3440696c303 --- /dev/null +++ b/tests/data/preview/format_unicode_escape_seq.py @@ -0,0 +1,33 @@ +x = "\x1F" +x = "\\x1B" +x = "\\\x1B" +x = "\U0001F60E" +x = "\u0001F60E" +x = r"\u0001F60E" +x = "don't format me" +x = "\xA3" +x = "\u2717" +x = "\uFaCe" +x = "\N{ox}\N{OX}" +x = "\N{lAtIn smaLL letteR x}" +x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" +x = b"\x1Fdon't byte" +x = rb"\x1Fdon't format" + +# output + +x = "\x1f" +x = "\\x1B" +x = "\\\x1b" +x = "\U0001f60e" +x = "\u0001F60E" +x = r"\u0001F60E" +x = "don't format me" +x = "\xa3" +x = "\u2717" +x = "\uface" +x = "\N{OX}\N{OX}" +x = "\N{LATIN SMALL LETTER X}" +x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" +x = b"\x1fdon't byte" +x = rb"\x1Fdon't format"