Skip to content

Commit 17377e0

Browse files
author
antoine.pitrou
committed
Issue #4163: Use unicode-friendly word splitting in the textwrap functions when given a unicode string.
git-svn-id: http://svn.python.org/projects/python/trunk@67746 6015fed2-1504-0410-9fe1-9d1591cc4771
1 parent e86f81b commit 17377e0

3 files changed

Lines changed: 20 additions & 7 deletions

File tree

Lib/test/test_textwrap.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def test_hyphenated_numbers(self):
174174
text = ("Python 1.0.0 was released on 1994-01-26. Python 1.0.1 was\n"
175175
"released on 1994-02-15.")
176176

177-
self.check_wrap(text, 30, ['Python 1.0.0 was released on',
177+
self.check_wrap(text, 35, ['Python 1.0.0 was released on',
178178
'1994-01-26. Python 1.0.1 was',
179179
'released on 1994-02-15.'])
180180
self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.',
@@ -353,6 +353,14 @@ def test_unicode(self):
353353
otext = self.wrapper.fill(text)
354354
assert isinstance(otext, unicode)
355355

356+
def test_no_split_at_umlaut(self):
357+
text = u"Die Empf\xe4nger-Auswahl"
358+
self.check_wrap(text, 13, [u"Die", u"Empf\xe4nger-", u"Auswahl"])
359+
360+
def test_umlaut_followed_by_dash(self):
361+
text = u"aa \xe4\xe4-\xe4\xe4"
362+
self.check_wrap(text, 7, [u"aa \xe4\xe4-", u"\xe4\xe4"])
363+
356364
def test_split(self):
357365
# Ensure that the standard _split() method works as advertised
358366
# in the comments

Lib/textwrap.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,16 +84,16 @@ class TextWrapper:
8484
# splits into
8585
# Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
8686
# (after stripping out empty strings).
87-
wordsep_re = re.compile(
87+
wordsep_re = (
8888
r'(\s+|' # any whitespace
89-
r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words
89+
r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words
9090
r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
9191

9292
# This less funky little regex just splits on recognized spaces. E.g.
9393
# "Hello there -- you goof-ball, use the -b option!"
9494
# splits into
9595
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
96-
wordsep_simple_re = re.compile(r'(\s+)')
96+
wordsep_simple_re = r'(\s+)'
9797

9898
# XXX this is not locale- or charset-aware -- string.lowercase
9999
# is US-ASCII only (and therefore English-only)
@@ -160,10 +160,12 @@ def _split(self, text):
160160
'use', ' ', 'the', ' ', '-b', ' ', 'option!'
161161
otherwise.
162162
"""
163-
if self.break_on_hyphens is True:
164-
chunks = self.wordsep_re.split(text)
163+
flags = re.UNICODE if isinstance(text, unicode) else 0
164+
if self.break_on_hyphens:
165+
pat = self.wordsep_re
165166
else:
166-
chunks = self.wordsep_simple_re.split(text)
167+
pat = self.wordsep_simple_re
168+
chunks = re.compile(pat, flags).split(text)
167169
chunks = filter(None, chunks) # remove empty chunks
168170
return chunks
169171

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ Core and Builtins
7474
Library
7575
-------
7676

77+
- Issue #4163: Use unicode-friendly word splitting in the textwrap functions
78+
when given a unicode string.
79+
7780
- Issue #4616: TarFile.utime(): Restore directory times on Windows.
7881

7982
- Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to

0 commit comments

Comments
 (0)