From 6ea78e33b599605b0c1c35fbbc2df493d6dfc80a Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 06:30:46 +0100 Subject: [PATCH 1/8] bpo-24665: double-width CJK chars support for textwrap * Add ckj option flag, default to False * Add cjkwide(), cjklen() and cjkslices() utilities --- Lib/test/test_textwrap.py | 12 +++++++ Lib/textwrap.py | 69 ++++++++++++++++++++++++++++++++------- Misc/ACKS | 1 + 3 files changed, 70 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 5a33c151642c62..4a53d3fa7695f8 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -566,6 +566,10 @@ def setUp(self): self.text = '''\ Did you say "supercalifragilisticexpialidocious?" How *do* you spell that odd word, anyways? +''' + self.text_cjk = '''\ +Did you say "いろはにほへとちりぬるをいろはにほ?" +How りぬ るをいろはにほり ぬるは, anyways? ''' def test_break_long(self): @@ -579,6 +583,14 @@ def test_break_long(self): self.check_wrap(self.text, 50, ['Did you say "supercalifragilisticexpialidocious?"', 'How *do* you spell that odd word, anyways?']) + self.check_wrap(self.text_cjk, 30, + ['Did you say "いろはにほへとち', + 'りぬるをいろはにほ?" How りぬ', + 'るをいろはにほり ぬるは,', + 'anyways?'], cjk=True) + self.check_wrap(self.text_cjk, 50, + ['Did you say "いろはにほへとちりぬるをいろはにほ?"', + 'How りぬ るをいろはにほり ぬるは, anyways?'], cjk=True) # SF bug 797650. Prevent an infinite loop by making sure that at # least one character gets split off on every pass. diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 0c18dc582e17ae..fef5ce6c92ca1b 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -5,9 +5,10 @@ # Copyright (C) 2002, 2003 Python Software Foundation. # Written by Greg Ward -import re +import re, unicodedata -__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten'] +__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', + 'cjkwide', 'cjklen', 'cjkslices'] # Hardcode the recognized whitespace characters to the US-ASCII # whitespace characters. The main reason for doing this is that @@ -26,6 +27,8 @@ class TextWrapper: width (default: 70) the maximum width of wrapped lines (unless break_long_words is false) + cjk (default: False) + Handle double-width CJK chars. initial_indent (default: "") string that will be prepended to the first line of wrapped output. Counts towards the line's width. @@ -114,6 +117,7 @@ class TextWrapper: def __init__(self, width=70, + cjk=False, initial_indent="", subsequent_indent="", expand_tabs=True, @@ -127,6 +131,7 @@ def __init__(self, max_lines=None, placeholder=' [...]'): self.width = width + self.cjk = cjk self.initial_indent = initial_indent self.subsequent_indent = subsequent_indent self.expand_tabs = expand_tabs @@ -139,6 +144,7 @@ def __init__(self, self.max_lines = max_lines self.placeholder = placeholder + self.len = cjklen if self.cjk else len # -- Private methods ----------------------------------------------- # (possibly useful for subclasses to override) @@ -215,8 +221,13 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # If we're allowed to break long words, then do so: put as much # of the next chunk onto the current line as will fit. if self.break_long_words: - cur_line.append(reversed_chunks[-1][:space_left]) - reversed_chunks[-1] = reversed_chunks[-1][space_left:] + if self.cjk: + chunk_start, chunk_end = cjkslices(reversed_chunks[-1], space_left) + cur_line.append(chunk_start) + reversed_chunks[-1] = chunk_end + else: + cur_line.append(reversed_chunks[-1][:space_left]) + reversed_chunks[-1] = reversed_chunks[-1][space_left:] # Otherwise, we have to preserve the long word intact. Only add # it to the current line if there's nothing already there -- @@ -246,6 +257,9 @@ def _wrap_chunks(self, chunks): lines = [] if self.width <= 0: raise ValueError("invalid width %r (must be > 0)" % self.width) + elif self.width == 1 and (sum(self.len(chunk) for chunk in chunks) > + sum(len(chunk) for chunk in chunks)): + raise ValueError("invalid width 1 (must be > 1 when CJK chars)") if self.max_lines is not None: if self.max_lines > 1: indent = self.subsequent_indent @@ -280,7 +294,7 @@ def _wrap_chunks(self, chunks): del chunks[-1] while chunks: - l = len(chunks[-1]) + l = self.len(chunks[-1]) # Can at least squeeze this chunk onto the current line. if cur_len + l <= width: @@ -293,7 +307,7 @@ def _wrap_chunks(self, chunks): # The current line is full, and the next chunk is too big to # fit on *any* line (not just this one). - if chunks and len(chunks[-1]) > width: + if chunks and self.len(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) cur_len = sum(map(len, cur_line)) @@ -365,7 +379,7 @@ def fill(self, text): # -- Convenience interface --------------------------------------------- -def wrap(text, width=70, **kwargs): +def wrap(text, width=70, cjk=False, **kwargs): """Wrap a single paragraph of text, returning a list of wrapped lines. Reformat the single paragraph in 'text' so it fits in lines of no @@ -375,10 +389,10 @@ def wrap(text, width=70, **kwargs): space. See TextWrapper class for available keyword args to customize wrapping behaviour. """ - w = TextWrapper(width=width, **kwargs) + w = TextWrapper(width=width, cjk=cjk, **kwargs) return w.wrap(text) -def fill(text, width=70, **kwargs): +def fill(text, width=70, cjk=False, **kwargs): """Fill a single paragraph of text, returning a new string. Reformat the single paragraph in 'text' to fit in lines of no more @@ -387,10 +401,10 @@ def fill(text, width=70, **kwargs): whitespace characters converted to space. See TextWrapper class for available keyword args to customize wrapping behaviour. """ - w = TextWrapper(width=width, **kwargs) + w = TextWrapper(width=width, cjk=cjk, **kwargs) return w.fill(text) -def shorten(text, width, **kwargs): +def shorten(text, width, cjk=False, **kwargs): """Collapse and truncate the given text to fit in the given width. The text first has its whitespace collapsed. If it then fits in @@ -402,10 +416,41 @@ def shorten(text, width, **kwargs): >>> textwrap.shorten("Hello world!", width=11) 'Hello [...]' """ - w = TextWrapper(width=width, max_lines=1, **kwargs) + w = TextWrapper(width=width, cjk=cjk, max_lines=1, **kwargs) return w.fill(' '.join(text.strip().split())) +# -- CJK support ------------------------------------------------------ + +def cjkwide(char): + """Return True if char is Fullwidth or Wide, False otherwise. + Fullwidth and Wide CJK chars are double-width. + """ + return unicodedata.east_asian_width(char) in ('F', 'W') + + +def cjklen(text): + """Return the real width of text (its len if not a string). + """ + if not isinstance(text, str): + return len(text) + return sum(2 if cjkwide(char) else 1 for char in text) + + +def cjkslices(text, index): + """Return the two slices of text cut to the index. + """ + if not isinstance(text, str): + return text[:index], text[index:] + if cjklen(text) <= index: + return text, '' + i = 1 + # <= and i-1 to catch the last double length char of odd line + while cjklen(text[:i]) <= index: + i = i + 1 + return text[:i-1], text[i-1:] + + # -- Loosely related functionality ------------------------------------- _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE) diff --git a/Misc/ACKS b/Misc/ACKS index 319128c9e9a4d4..127b0811307c2a 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -495,6 +495,7 @@ Lele Gaifax Santiago Gala Yitzchak Gale Matthew Gallagher +Florent Gallaire Quentin Gallet-Gilles Riccardo Attilio Galli Raymund Galvin From aa94f2635bb2273cae0287f89340ab2551680ee1 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 09:58:43 +0100 Subject: [PATCH 2/8] Fix TextWrapper positionnal arguments --- Lib/textwrap.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index fef5ce6c92ca1b..079d4313ce40aa 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -3,6 +3,7 @@ # Copyright (C) 1999-2001 Gregory P. Ward. # Copyright (C) 2002, 2003 Python Software Foundation. +# Copyright (C) 2015-2017 Florent Gallaire # Written by Greg Ward import re, unicodedata @@ -27,8 +28,6 @@ class TextWrapper: width (default: 70) the maximum width of wrapped lines (unless break_long_words is false) - cjk (default: False) - Handle double-width CJK chars. initial_indent (default: "") string that will be prepended to the first line of wrapped output. Counts towards the line's width. @@ -64,6 +63,8 @@ class TextWrapper: Truncate wrapped lines. placeholder (default: ' [...]') Append to the last line of truncated text. + cjk (default: false) + Handle double-width CJK chars. """ unicode_whitespace_trans = {} @@ -117,7 +118,6 @@ class TextWrapper: def __init__(self, width=70, - cjk=False, initial_indent="", subsequent_indent="", expand_tabs=True, @@ -129,9 +129,9 @@ def __init__(self, tabsize=8, *, max_lines=None, - placeholder=' [...]'): + placeholder=' [...]', + cjk=False): self.width = width - self.cjk = cjk self.initial_indent = initial_indent self.subsequent_indent = subsequent_indent self.expand_tabs = expand_tabs @@ -143,6 +143,7 @@ def __init__(self, self.tabsize = tabsize self.max_lines = max_lines self.placeholder = placeholder + self.cjk = cjk self.len = cjklen if self.cjk else len From 0264d9dd24cb9d33877d3ae40346076ed6bf0a20 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 11:26:16 +0100 Subject: [PATCH 3/8] Fix one import per line --- Lib/textwrap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 079d4313ce40aa..628fce895bd5f0 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -6,7 +6,8 @@ # Copyright (C) 2015-2017 Florent Gallaire # Written by Greg Ward -import re, unicodedata +import re +import unicodedata __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', 'cjkwide', 'cjklen', 'cjkslices'] From d630821bd37b929e910618c8913e14efaac1356e Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 11:31:31 +0100 Subject: [PATCH 4/8] Rename self.len() in self._width() --- Lib/textwrap.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 628fce895bd5f0..1c1c196e024816 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -146,7 +146,7 @@ def __init__(self, self.placeholder = placeholder self.cjk = cjk - self.len = cjklen if self.cjk else len + self._width = cjklen if self.cjk else len # -- Private methods ----------------------------------------------- # (possibly useful for subclasses to override) @@ -259,7 +259,7 @@ def _wrap_chunks(self, chunks): lines = [] if self.width <= 0: raise ValueError("invalid width %r (must be > 0)" % self.width) - elif self.width == 1 and (sum(self.len(chunk) for chunk in chunks) > + elif self.width == 1 and (sum(self._width(chunk) for chunk in chunks) > sum(len(chunk) for chunk in chunks)): raise ValueError("invalid width 1 (must be > 1 when CJK chars)") if self.max_lines is not None: @@ -296,7 +296,7 @@ def _wrap_chunks(self, chunks): del chunks[-1] while chunks: - l = self.len(chunks[-1]) + l = self._width(chunks[-1]) # Can at least squeeze this chunk onto the current line. if cur_len + l <= width: @@ -309,7 +309,7 @@ def _wrap_chunks(self, chunks): # The current line is full, and the next chunk is too big to # fit on *any* line (not just this one). - if chunks and self.len(chunks[-1]) > width: + if chunks and self._width(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) cur_len = sum(map(len, cur_line)) From bfdfb22b7e4c33590a5aaaf82288cd7cf83c06bc Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Tue, 14 Feb 2017 11:37:29 +0100 Subject: [PATCH 5/8] Rename CJK functions with _ --- Lib/textwrap.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 1c1c196e024816..66fafc42f18401 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -10,7 +10,7 @@ import unicodedata __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', - 'cjkwide', 'cjklen', 'cjkslices'] + 'cjk_wide', 'cjk_len', 'cjk_slices'] # Hardcode the recognized whitespace characters to the US-ASCII # whitespace characters. The main reason for doing this is that @@ -146,7 +146,7 @@ def __init__(self, self.placeholder = placeholder self.cjk = cjk - self._width = cjklen if self.cjk else len + self._width = cjk_len if self.cjk else len # -- Private methods ----------------------------------------------- # (possibly useful for subclasses to override) @@ -224,7 +224,7 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # of the next chunk onto the current line as will fit. if self.break_long_words: if self.cjk: - chunk_start, chunk_end = cjkslices(reversed_chunks[-1], space_left) + chunk_start, chunk_end = cjk_slices(reversed_chunks[-1], space_left) cur_line.append(chunk_start) reversed_chunks[-1] = chunk_end else: @@ -424,31 +424,31 @@ def shorten(text, width, cjk=False, **kwargs): # -- CJK support ------------------------------------------------------ -def cjkwide(char): +def cjk_wide(char): """Return True if char is Fullwidth or Wide, False otherwise. Fullwidth and Wide CJK chars are double-width. """ return unicodedata.east_asian_width(char) in ('F', 'W') -def cjklen(text): +def cjk_len(text): """Return the real width of text (its len if not a string). """ if not isinstance(text, str): return len(text) - return sum(2 if cjkwide(char) else 1 for char in text) + return sum(2 if cjk_wide(char) else 1 for char in text) -def cjkslices(text, index): +def cjk_slices(text, index): """Return the two slices of text cut to the index. """ if not isinstance(text, str): return text[:index], text[index:] - if cjklen(text) <= index: + if cjk_len(text) <= index: return text, '' i = 1 # <= and i-1 to catch the last double length char of odd line - while cjklen(text[:i]) <= index: + while cjk_len(text[:i]) <= index: i = i + 1 return text[:i-1], text[i-1:] From 8337ce50e71ba46a4dcc473d9634a5b9ddda7dca Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Wed, 15 Feb 2017 00:34:46 +0100 Subject: [PATCH 6/8] Improve cjk_slices() complexity from O(n^2) to O(n) (Thanks to INADA Naoki) --- Lib/textwrap.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 66fafc42f18401..559896d6aad48d 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -446,11 +446,12 @@ def cjk_slices(text, index): return text[:index], text[index:] if cjk_len(text) <= index: return text, '' - i = 1 - # <= and i-1 to catch the last double length char of odd line - while cjk_len(text[:i]) <= index: - i = i + 1 - return text[:i-1], text[i-1:] + width = 0 + for i, char in enumerate(text): + width = width + cjk_wide(char) + 1 + if width > index: + break + return text[:i], text[i:] # -- Loosely related functionality ------------------------------------- From cb9812bada4b96806873b8818757828fe6985d58 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Wed, 15 Feb 2017 02:04:04 +0100 Subject: [PATCH 7/8] Add Doc for new CJK option and functions --- Doc/library/textwrap.rst | 29 +++++++++++++++++++++++++++++ Lib/textwrap.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/Doc/library/textwrap.rst b/Doc/library/textwrap.rst index 438007d0028d86..bbb87ed14ef8aa 100644 --- a/Doc/library/textwrap.rst +++ b/Doc/library/textwrap.rst @@ -117,6 +117,28 @@ functions should be good enough; otherwise, you should use an instance of .. versionadded:: 3.3 +.. function:: cjk_wide(char) + + Return ``True`` if *char* is Fullwidth or Wide, ``False`` otherwise. + Fullwidth and Wide CJK chars are double-width. + + .. versionadded:: 3.7 + + +.. function:: cjk_len(text) + + Return the real width of *text* (its len if not a string). + + .. versionadded:: 3.7 + + +.. function:: cjk_slices(text, index) + + Return the two slices of *text* cut to *index*. + + .. versionadded:: 3.7 + + :func:`wrap`, :func:`fill` and :func:`shorten` work by creating a :class:`TextWrapper` instance and calling a single method on it. That instance is not reused, so for applications that process many text @@ -276,6 +298,13 @@ hyphenated words; only then will long words be broken if necessary, unless .. versionadded:: 3.4 + .. attribute:: cjk + + (default: ``False``) Handle double-width CJK chars. + + .. versionadded:: 3.7 + + :class:`TextWrapper` also provides some public methods, analogous to the module-level convenience functions: diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 559896d6aad48d..2ad2a4f3b4b69b 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -440,7 +440,7 @@ def cjk_len(text): def cjk_slices(text, index): - """Return the two slices of text cut to the index. + """Return the two slices of text cut to index. """ if not isinstance(text, str): return text[:index], text[index:] From 54de7aa6c6fffe9b2248153051a24b0e658bf665 Mon Sep 17 00:00:00 2001 From: Florent Gallaire Date: Wed, 15 Feb 2017 03:25:48 +0100 Subject: [PATCH 8/8] Fix Python build problems --- Lib/idlelib/idle_test/test_calltips.py | 2 +- Lib/textwrap.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/idlelib/idle_test/test_calltips.py b/Lib/idlelib/idle_test/test_calltips.py index 0b11602ca9e414..1d06e0d0b5b578 100644 --- a/Lib/idlelib/idle_test/test_calltips.py +++ b/Lib/idlelib/idle_test/test_calltips.py @@ -72,7 +72,7 @@ def test_signature_wrap(self): (width=70, initial_indent='', subsequent_indent='', expand_tabs=True, replace_whitespace=True, fix_sentence_endings=False, break_long_words=True, drop_whitespace=True, break_on_hyphens=True, tabsize=8, *, max_lines=None, - placeholder=' [...]')''') + placeholder=' [...]', cjk=False)''') def test_docline_truncation(self): def f(): pass diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 2ad2a4f3b4b69b..1c6146abdee03d 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -7,7 +7,6 @@ # Written by Greg Ward import re -import unicodedata __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', 'cjk_wide', 'cjk_len', 'cjk_slices'] @@ -428,6 +427,7 @@ def cjk_wide(char): """Return True if char is Fullwidth or Wide, False otherwise. Fullwidth and Wide CJK chars are double-width. """ + import unicodedata return unicodedata.east_asian_width(char) in ('F', 'W') pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy