diff --git a/Lib/test/test_traceback.py b/Lib/test/test_traceback.py index ccc59870c2a952..2881a3489084a0 100644 --- a/Lib/test/test_traceback.py +++ b/Lib/test/test_traceback.py @@ -893,7 +893,62 @@ def f(): f" callable()", f" File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f", f" print(1, www(", - f" ^^^^", + f" ^^^^^^^", + ] + self.assertEqual(actual, expected) + + def test_byte_offset_with_wide_characters_term_highlight(self): + def f(): + 说明说明 = 1 + şçöğıĤellö = 0 # not wide but still non-ascii + return 说明说明 / şçöğıĤellö + + actual = self.get_exception(f) + expected = [ + f"Traceback (most recent call last):", + f" File \"{__file__}\", line {self.callable_line}, in get_exception", + f" callable()", + f" File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f", + f" return 说明说明 / şçöğıĤellö", + f" ~~~~~~~~~^~~~~~~~~~~~", + ] + self.assertEqual(actual, expected) + + def test_byte_offset_with_emojis_term_highlight(self): + def f(): + return "✨🐍" + func_说明说明("📗🚛", + "📗🚛") + "🐍" + + actual = self.get_exception(f) + expected = [ + f"Traceback (most recent call last):", + f" File \"{__file__}\", line {self.callable_line}, in get_exception", + f" callable()", + f" File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f", + f' return "✨🐍" + func_说明说明("📗🚛",', + f" ^^^^^^^^^^^^^", + ] + self.assertEqual(actual, expected) + + def test_byte_offset_wide_chars_subscript(self): + def f(): + my_dct = { + "✨🚛✨": { + "说明": { + "🐍🐍🐍": None + } + } + } + return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"] + + actual = self.get_exception(f) + expected = [ + f"Traceback (most recent call last):", + f" File \"{__file__}\", line {self.callable_line}, in get_exception", + f" callable()", + f" File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f", + f' return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]', + f" ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^", ] self.assertEqual(actual, expected) diff --git a/Lib/traceback.py b/Lib/traceback.py index 
0e229553cb5d25..ea045e27610d4d 100644 --- a/Lib/traceback.py +++ b/Lib/traceback.py @@ -465,7 +465,8 @@ def format_frame_summary(self, frame_summary): stripped_line = frame_summary.line.strip() row.append(' {}\n'.format(stripped_line)) - orig_line_len = len(frame_summary._original_line) + line = frame_summary._original_line + orig_line_len = len(line) frame_line_len = len(frame_summary.line.lstrip()) stripped_characters = orig_line_len - frame_line_len if ( @@ -473,31 +474,40 @@ def format_frame_summary(self, frame_summary): and frame_summary.end_colno is not None ): start_offset = _byte_offset_to_character_offset( - frame_summary._original_line, frame_summary.colno) + 1 + line, frame_summary.colno) end_offset = _byte_offset_to_character_offset( - frame_summary._original_line, frame_summary.end_colno) + 1 + line, frame_summary.end_colno) + code_segment = line[start_offset:end_offset] anchors = None if frame_summary.lineno == frame_summary.end_lineno: with suppress(Exception): - anchors = _extract_caret_anchors_from_line_segment( - frame_summary._original_line[start_offset - 1:end_offset - 1] - ) + anchors = _extract_caret_anchors_from_line_segment(code_segment) else: - end_offset = stripped_characters + len(stripped_line) + # Don't count the newline since the anchors only need to + # go up until the last character of the line. + end_offset = len(line.rstrip()) # show indicators if primary char doesn't span the frame line if end_offset - start_offset < len(stripped_line) or ( anchors and anchors.right_start_offset - anchors.left_end_offset > 0): + # When showing this on a terminal, some of the non-ASCII characters + # might be rendered as double-width characters, so we need to take + # that into account when calculating the length of the line. 
+ dp_start_offset = _display_width(line, start_offset) + 1 + dp_end_offset = _display_width(line, end_offset) + 1 + row.append(' ') - row.append(' ' * (start_offset - stripped_characters)) + row.append(' ' * (dp_start_offset - stripped_characters)) if anchors: - row.append(anchors.primary_char * (anchors.left_end_offset)) - row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset)) - row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset)) + dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset) + dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset) + row.append(anchors.primary_char * dp_left_end_offset) + row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset)) + row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset)) else: - row.append('^' * (end_offset - start_offset)) + row.append('^' * (dp_end_offset - dp_start_offset)) row.append('\n') @@ -618,6 +628,25 @@ def _extract_caret_anchors_from_line_segment(segment): return None +_WIDE_CHAR_SPECIFIERS = "WF" + +def _display_width(line, offset): + """Calculate the extra amount of width space the given source + code segment might take if it were to be displayed on a fixed + width output device. 
Supports wide unicode characters and emojis.""" + + # Fast track for ASCII-only strings + if line.isascii(): + return offset + + import unicodedata + + return sum( + 2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1 + for char in line[:offset] + ) + + class _ExceptionPrintContext: def __init__(self): diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-10-26-15-34-11.gh-issue-88116.W9-vaQ.rst b/Misc/NEWS.d/next/Core and Builtins/2023-10-26-15-34-11.gh-issue-88116.W9-vaQ.rst new file mode 100644 index 00000000000000..12257ef2b0b9b0 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-10-26-15-34-11.gh-issue-88116.W9-vaQ.rst @@ -0,0 +1,3 @@ +Traceback location ranges involving wide Unicode characters (like emoji and +Asian characters) are now properly highlighted. Patch by Batuhan Taskaya and +Pablo Galindo. diff --git a/Parser/pegen.c b/Parser/pegen.c index 87b47bacec553f..3b85b095beb235 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset) return size; } +// Calculate the extra amount of width space the given source +// code segment might take if it were to be displayed on a fixed +// width output device. Supports wide unicode characters and emojis.
+Py_ssize_t +_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset) +{ + PyObject *segment = PyUnicode_Substring(line, 0, character_offset); + if (!segment) { + return -1; + } + + // Fast track for ascii strings + if (PyUnicode_IS_ASCII(segment)) { + Py_DECREF(segment); + return character_offset; + } + + PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width"); + if (!width_fn) { + return -1; + } + + Py_ssize_t width = 0; + Py_ssize_t len = PyUnicode_GET_LENGTH(segment); + for (Py_ssize_t i = 0; i < len; i++) { + PyObject *chr = PyUnicode_Substring(segment, i, i + 1); + if (!chr) { + Py_DECREF(segment); + Py_DECREF(width_fn); + return -1; + } + + PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr); + Py_DECREF(chr); + if (!width_specifier) { + Py_DECREF(segment); + Py_DECREF(width_fn); + return -1; + } + + if (_PyUnicode_EqualToASCIIString(width_specifier, "W") || + _PyUnicode_EqualToASCIIString(width_specifier, "F")) { + width += 2; + } + else { + width += 1; + } + Py_DECREF(width_specifier); + } + + Py_DECREF(segment); + Py_DECREF(width_fn); + return width; +} + // Here, mark is the start of the node, while p->mark is the end. // If node==NULL, they should be the same. 
int diff --git a/Parser/pegen.h b/Parser/pegen.h index fe0c327b875566..2c4b2c3dfc65c6 100644 --- a/Parser/pegen.h +++ b/Parser/pegen.h @@ -143,6 +143,7 @@ expr_ty _PyPegen_name_token(Parser *p); expr_ty _PyPegen_number_token(Parser *p); void *_PyPegen_string_token(Parser *p); Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset); +Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset); // Error handling functions and APIs typedef enum { diff --git a/Python/traceback.c b/Python/traceback.c index c4f5ec877bba5d..130f945c290234 100644 --- a/Python/traceback.c +++ b/Python/traceback.c @@ -907,8 +907,39 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen goto done; } - if (print_error_location_carets(f, truncation, start_offset, end_offset, - right_start_offset, left_end_offset, + // Convert all offsets to display offsets (e.g. the space they would take up if printed + // on the screen). + Py_ssize_t dp_start = _PyPegen_calculate_display_width(source_line, start_offset); + if (dp_start < 0) { + err = ignore_source_errors() < 0; + goto done; + } + + Py_ssize_t dp_end = _PyPegen_calculate_display_width(source_line, end_offset); + if (dp_end < 0) { + err = ignore_source_errors() < 0; + goto done; + } + + Py_ssize_t dp_left_end = -1; + Py_ssize_t dp_right_start = -1; + if (has_secondary_ranges) { + dp_left_end = _PyPegen_calculate_display_width(source_line, left_end_offset); + if (dp_left_end < 0) { + err = ignore_source_errors() < 0; + goto done; + } + + dp_right_start = _PyPegen_calculate_display_width(source_line, right_start_offset); + if (dp_right_start < 0) { + err = ignore_source_errors() < 0; + goto done; + } + } + + + if (print_error_location_carets(f, truncation, dp_start, dp_end, + dp_right_start, dp_left_end, primary_error_char, secondary_error_char) < 0) { err = -1; goto done;