From 6ea78e33b599605b0c1c35fbbc2df493d6dfc80a Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 06:30:46 +0100 Subject: [PATCH 1/8] bpo-24665: double-width CJK chars support for textwrap * Add cjk option flag, default to False * Add cjkwide(), cjklen() and cjkslices() utilities --- Lib/test/test_textwrap.py | 12 +++++++ Lib/textwrap.py | 69 ++++++++++++++++++++++++++++++++------- Misc/ACKS | 1 + 3 files changed, 70 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 5a33c151642c62..4a53d3fa7695f8 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -566,6 +566,10 @@ def setUp(self): self.text = '''\ Did you say "supercalifragilisticexpialidocious?" How *do* you spell that odd word, anyways? +''' + self.text_cjk = '''\ +Did you say "いろはにほへとちりぬるをいろはにほ?" +How りぬ るをいろはにほり ぬるは, anyways? ''' def test_break_long(self): @@ -579,6 +583,14 @@ def test_break_long(self): self.check_wrap(self.text, 50, ['Did you say "supercalifragilisticexpialidocious?"', 'How *do* you spell that odd word, anyways?']) + self.check_wrap(self.text_cjk, 30, + ['Did you say "いろはにほへとち', + 'りぬるをいろはにほ?" How りぬ', + 'るをいろはにほり ぬるは,', + 'anyways?'], cjk=True) + self.check_wrap(self.text_cjk, 50, + ['Did you say "いろはにほへとちりぬるをいろはにほ?"', + 'How りぬ るをいろはにほり ぬるは, anyways?'], cjk=True) # SF bug 797650. Prevent an infinite loop by making sure that at # least one character gets split off on every pass. diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 0c18dc582e17ae..fef5ce6c92ca1b 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -5,9 +5,10 @@ # Copyright (C) 2002, 2003 Python Software Foundation. 
# Written by Greg Ward -import re +import re, unicodedata -__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten'] +__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', + 'cjkwide', 'cjklen', 'cjkslices'] # Hardcode the recognized whitespace characters to the US-ASCII # whitespace characters. The main reason for doing this is that @@ -26,6 +27,8 @@ class TextWrapper: width (default: 70) the maximum width of wrapped lines (unless break_long_words is false) + cjk (default: False) + Handle double-width CJK chars. initial_indent (default: "") string that will be prepended to the first line of wrapped output. Counts towards the line's width. @@ -114,6 +117,7 @@ class TextWrapper: def __init__(self, width=70, + cjk=False, initial_indent="", subsequent_indent="", expand_tabs=True, @@ -127,6 +131,7 @@ def __init__(self, max_lines=None, placeholder=' [...]'): self.width = width + self.cjk = cjk self.initial_indent = initial_indent self.subsequent_indent = subsequent_indent self.expand_tabs = expand_tabs @@ -139,6 +144,7 @@ def __init__(self, self.max_lines = max_lines self.placeholder = placeholder + self.len = cjklen if self.cjk else len # -- Private methods ----------------------------------------------- # (possibly useful for subclasses to override) @@ -215,8 +221,13 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # If we're allowed to break long words, then do so: put as much # of the next chunk onto the current line as will fit. if self.break_long_words: - cur_line.append(reversed_chunks[-1][:space_left]) - reversed_chunks[-1] = reversed_chunks[-1][space_left:] + if self.cjk: + chunk_start, chunk_end = cjkslices(reversed_chunks[-1], space_left) + cur_line.append(chunk_start) + reversed_chunks[-1] = chunk_end + else: + cur_line.append(reversed_chunks[-1][:space_left]) + reversed_chunks[-1] = reversed_chunks[-1][space_left:] # Otherwise, we have to preserve the long word intact. 
Only add # it to the current line if there's nothing already there -- @@ -246,6 +257,9 @@ def _wrap_chunks(self, chunks): lines = [] if self.width <= 0: raise ValueError("invalid width %r (must be > 0)" % self.width) + elif self.width == 1 and (sum(self.len(chunk) for chunk in chunks) > + sum(len(chunk) for chunk in chunks)): + raise ValueError("invalid width 1 (must be > 1 when CJK chars)") if self.max_lines is not None: if self.max_lines > 1: indent = self.subsequent_indent @@ -280,7 +294,7 @@ def _wrap_chunks(self, chunks): del chunks[-1] while chunks: - l = len(chunks[-1]) + l = self.len(chunks[-1]) # Can at least squeeze this chunk onto the current line. if cur_len + l <= width: @@ -293,7 +307,7 @@ def _wrap_chunks(self, chunks): # The current line is full, and the next chunk is too big to # fit on *any* line (not just this one). - if chunks and len(chunks[-1]) > width: + if chunks and self.len(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) cur_len = sum(map(len, cur_line)) @@ -365,7 +379,7 @@ def fill(self, text): # -- Convenience interface --------------------------------------------- -def wrap(text, width=70, **kwargs): +def wrap(text, width=70, cjk=False, **kwargs): """Wrap a single paragraph of text, returning a list of wrapped lines. Reformat the single paragraph in 'text' so it fits in lines of no @@ -375,10 +389,10 @@ def wrap(text, width=70, **kwargs): space. See TextWrapper class for available keyword args to customize wrapping behaviour. """ - w = TextWrapper(width=width, **kwargs) + w = TextWrapper(width=width, cjk=cjk, **kwargs) return w.wrap(text) -def fill(text, width=70, **kwargs): +def fill(text, width=70, cjk=False, **kwargs): """Fill a single paragraph of text, returning a new string. Reformat the single paragraph in 'text' to fit in lines of no more @@ -387,10 +401,10 @@ def fill(text, width=70, **kwargs): whitespace characters converted to space. 
See TextWrapper class for available keyword args to customize wrapping behaviour. """ - w = TextWrapper(width=width, **kwargs) + w = TextWrapper(width=width, cjk=cjk, **kwargs) return w.fill(text) -def shorten(text, width, **kwargs): +def shorten(text, width, cjk=False, **kwargs): """Collapse and truncate the given text to fit in the given width. The text first has its whitespace collapsed. If it then fits in @@ -402,10 +416,41 @@ def shorten(text, width, **kwargs): >>> textwrap.shorten("Hello world!", width=11) 'Hello [...]' """ - w = TextWrapper(width=width, max_lines=1, **kwargs) + w = TextWrapper(width=width, cjk=cjk, max_lines=1, **kwargs) return w.fill(' '.join(text.strip().split())) +# -- CJK support ------------------------------------------------------ + +def cjkwide(char): + """Return True if char is Fullwidth or Wide, False otherwise. + Fullwidth and Wide CJK chars are double-width. + """ + return unicodedata.east_asian_width(char) in ('F', 'W') + + +def cjklen(text): + """Return the real width of text (its len if not a string). + """ + if not isinstance(text, str): + return len(text) + return sum(2 if cjkwide(char) else 1 for char in text) + + +def cjkslices(text, index): + """Return the two slices of text cut to the index. 
+ """ + if not isinstance(text, str): + return text[:index], text[index:] + if cjklen(text) <= index: + return text, '' + i = 1 + # <= and i-1 to catch the last double length char of odd line + while cjklen(text[:i]) <= index: + i = i + 1 + return text[:i-1], text[i-1:] + + # -- Loosely related functionality ------------------------------------- _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE) diff --git a/Misc/ACKS b/Misc/ACKS index 319128c9e9a4d4..127b0811307c2a 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -495,6 +495,7 @@ Lele Gaifax Santiago Gala Yitzchak Gale Matthew Gallagher +Florent Gallaire Quentin Gallet-Gilles Riccardo Attilio Galli Raymund Galvin From aa94f2635bb2273cae0287f89340ab2551680ee1 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 09:58:43 +0100 Subject: [PATCH 2/8] Fix TextWrapper positional arguments --- Lib/textwrap.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index fef5ce6c92ca1b..079d4313ce40aa 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -3,6 +3,7 @@ # Copyright (C) 1999-2001 Gregory P. Ward. # Copyright (C) 2002, 2003 Python Software Foundation. +# Copyright (C) 2015-2017 Florent Gallaire # Written by Greg Ward import re, unicodedata @@ -27,8 +28,6 @@ class TextWrapper: width (default: 70) the maximum width of wrapped lines (unless break_long_words is false) - cjk (default: False) - Handle double-width CJK chars. initial_indent (default: "") string that will be prepended to the first line of wrapped output. Counts towards the line's width. @@ -64,6 +63,8 @@ class TextWrapper: Truncate wrapped lines. placeholder (default: ' [...]') Append to the last line of truncated text. + cjk (default: false) + Handle double-width CJK chars. 
""" unicode_whitespace_trans = {} @@ -117,7 +118,6 @@ class TextWrapper: def __init__(self, width=70, - cjk=False, initial_indent="", subsequent_indent="", expand_tabs=True, @@ -129,9 +129,9 @@ def __init__(self, tabsize=8, *, max_lines=None, - placeholder=' [...]'): + placeholder=' [...]', + cjk=False): self.width = width - self.cjk = cjk self.initial_indent = initial_indent self.subsequent_indent = subsequent_indent self.expand_tabs = expand_tabs @@ -143,6 +143,7 @@ def __init__(self, self.tabsize = tabsize self.max_lines = max_lines self.placeholder = placeholder + self.cjk = cjk self.len = cjklen if self.cjk else len From 0264d9dd24cb9d33877d3ae40346076ed6bf0a20 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 11:26:16 +0100 Subject: [PATCH 3/8] Fix one import per line --- Lib/textwrap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 079d4313ce40aa..628fce895bd5f0 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -6,7 +6,8 @@ # Copyright (C) 2015-2017 Florent Gallaire # Written by Greg Ward -import re, unicodedata +import re +import unicodedata __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', 'cjkwide', 'cjklen', 'cjkslices'] From d630821bd37b929e910618c8913e14efaac1356e Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 11:31:31 +0100 Subject: [PATCH 4/8] Rename self.len() in self._width() --- Lib/textwrap.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 628fce895bd5f0..1c1c196e024816 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -146,7 +146,7 @@ def __init__(self, self.placeholder = placeholder self.cjk = cjk - self.len = cjklen if self.cjk else len + self._width = cjklen if self.cjk else len # -- Private methods ----------------------------------------------- # (possibly useful for subclasses to override) @@ -259,7 +259,7 @@ def _wrap_chunks(self, 
chunks): lines = [] if self.width <= 0: raise ValueError("invalid width %r (must be > 0)" % self.width) - elif self.width == 1 and (sum(self.len(chunk) for chunk in chunks) > + elif self.width == 1 and (sum(self._width(chunk) for chunk in chunks) > sum(len(chunk) for chunk in chunks)): raise ValueError("invalid width 1 (must be > 1 when CJK chars)") if self.max_lines is not None: @@ -296,7 +296,7 @@ def _wrap_chunks(self, chunks): del chunks[-1] while chunks: - l = self.len(chunks[-1]) + l = self._width(chunks[-1]) # Can at least squeeze this chunk onto the current line. if cur_len + l <= width: @@ -309,7 +309,7 @@ def _wrap_chunks(self, chunks): # The current line is full, and the next chunk is too big to # fit on *any* line (not just this one). - if chunks and self.len(chunks[-1]) > width: + if chunks and self._width(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) cur_len = sum(map(len, cur_line)) From bfdfb22b7e4c33590a5aaaf82288cd7cf83c06bc Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 11:37:29 +0100 Subject: [PATCH 5/8] Rename CJK functions with _ --- Lib/textwrap.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 1c1c196e024816..66fafc42f18401 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -10,7 +10,7 @@ import unicodedata __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', - 'cjkwide', 'cjklen', 'cjkslices'] + 'cjk_wide', 'cjk_len', 'cjk_slices'] # Hardcode the recognized whitespace characters to the US-ASCII # whitespace characters. 
The main reason for doing this is that @@ -146,7 +146,7 @@ def __init__(self, self.placeholder = placeholder self.cjk = cjk - self._width = cjklen if self.cjk else len + self._width = cjk_len if self.cjk else len # -- Private methods ----------------------------------------------- # (possibly useful for subclasses to override) @@ -224,7 +224,7 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # of the next chunk onto the current line as will fit. if self.break_long_words: if self.cjk: - chunk_start, chunk_end = cjkslices(reversed_chunks[-1], space_left) + chunk_start, chunk_end = cjk_slices(reversed_chunks[-1], space_left) cur_line.append(chunk_start) reversed_chunks[-1] = chunk_end else: @@ -424,31 +424,31 @@ def shorten(text, width, cjk=False, **kwargs): # -- CJK support ------------------------------------------------------ -def cjkwide(char): +def cjk_wide(char): """Return True if char is Fullwidth or Wide, False otherwise. Fullwidth and Wide CJK chars are double-width. """ return unicodedata.east_asian_width(char) in ('F', 'W') -def cjklen(text): +def cjk_len(text): """Return the real width of text (its len if not a string). """ if not isinstance(text, str): return len(text) - return sum(2 if cjkwide(char) else 1 for char in text) + return sum(2 if cjk_wide(char) else 1 for char in text) -def cjkslices(text, index): +def cjk_slices(text, index): """Return the two slices of text cut to the index. 
""" if not isinstance(text, str): return text[:index], text[index:] - if cjklen(text) <= index: + if cjk_len(text) <= index: return text, '' i = 1 # <= and i-1 to catch the last double length char of odd line - while cjklen(text[:i]) <= index: + while cjk_len(text[:i]) <= index: i = i + 1 return text[:i-1], text[i-1:] From 8337ce50e71ba46a4dcc473d9634a5b9ddda7dca Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Wed, 15 Feb 2017 00:34:46 +0100 Subject: [PATCH 6/8] Improve cjk_slices() complexity from O(n^2) to O(n) (Thanks to INADA Naoki) --- Lib/textwrap.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 66fafc42f18401..559896d6aad48d 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -446,11 +446,12 @@ def cjk_slices(text, index): return text[:index], text[index:] if cjk_len(text) <= index: return text, '' - i = 1 - # <= and i-1 to catch the last double length char of odd line - while cjk_len(text[:i]) <= index: - i = i + 1 - return text[:i-1], text[i-1:] + width = 0 + for i, char in enumerate(text): + width = width + cjk_wide(char) + 1 + if width > index: + break + return text[:i], text[i:] # -- Loosely related functionality ------------------------------------- From cb9812bada4b96806873b8818757828fe6985d58 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Wed, 15 Feb 2017 02:04:04 +0100 Subject: [PATCH 7/8] Add Doc for new CJK option and functions --- Doc/library/textwrap.rst | 29 +++++++++++++++++++++++++++++ Lib/textwrap.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/Doc/library/textwrap.rst b/Doc/library/textwrap.rst index 438007d0028d86..bbb87ed14ef8aa 100644 --- a/Doc/library/textwrap.rst +++ b/Doc/library/textwrap.rst @@ -117,6 +117,28 @@ functions should be good enough; otherwise, you should use an instance of .. versionadded:: 3.3 +.. function:: cjk_wide(char) + + Return ``True`` if *char* is Fullwidth or Wide, ``False`` otherwise. 
+ Fullwidth and Wide CJK chars are double-width. + + .. versionadded:: 3.7 + + +.. function:: cjk_len(text) + + Return the real width of *text* (its len if not a string). + + .. versionadded:: 3.7 + + +.. function:: cjk_slices(text, index) + + Return the two slices of *text* cut to *index*. + + .. versionadded:: 3.7 + + :func:`wrap`, :func:`fill` and :func:`shorten` work by creating a :class:`TextWrapper` instance and calling a single method on it. That instance is not reused, so for applications that process many text @@ -276,6 +298,13 @@ hyphenated words; only then will long words be broken if necessary, unless .. versionadded:: 3.4 + .. attribute:: cjk + + (default: ``False``) Handle double-width CJK chars. + + .. versionadded:: 3.7 + + :class:`TextWrapper` also provides some public methods, analogous to the module-level convenience functions: diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 559896d6aad48d..2ad2a4f3b4b69b 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -440,7 +440,7 @@ def cjk_len(text): def cjk_slices(text, index): - """Return the two slices of text cut to the index. + """Return the two slices of text cut to index. 
""" if not isinstance(text, str): return text[:index], text[index:] From 54de7aa6c6fffe9b2248153051a24b0e658bf665 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Wed, 15 Feb 2017 03:25:48 +0100 Subject: [PATCH 8/8] Fix Python build problems --- Lib/idlelib/idle_test/test_calltips.py | 2 +- Lib/textwrap.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/idlelib/idle_test/test_calltips.py b/Lib/idlelib/idle_test/test_calltips.py index 0b11602ca9e414..1d06e0d0b5b578 100644 --- a/Lib/idlelib/idle_test/test_calltips.py +++ b/Lib/idlelib/idle_test/test_calltips.py @@ -72,7 +72,7 @@ def test_signature_wrap(self): (width=70, initial_indent='', subsequent_indent='', expand_tabs=True, replace_whitespace=True, fix_sentence_endings=False, break_long_words=True, drop_whitespace=True, break_on_hyphens=True, tabsize=8, *, max_lines=None, - placeholder=' [...]')''') + placeholder=' [...]', cjk=False)''') def test_docline_truncation(self): def f(): pass diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 2ad2a4f3b4b69b..1c6146abdee03d 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -7,7 +7,6 @@ # Written by Greg Ward import re -import unicodedata __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', 'cjk_wide', 'cjk_len', 'cjk_slices'] @@ -428,6 +427,7 @@ def cjk_wide(char): """Return True if char is Fullwidth or Wide, False otherwise. Fullwidth and Wide CJK chars are double-width. """ + import unicodedata return unicodedata.east_asian_width(char) in ('F', 'W')