diff --git a/html2text.py b/html2text.py index 2f650ab5..29fea4d7 100755 --- a/html2text.py +++ b/html2text.py @@ -30,7 +30,7 @@ def has_key(x, y): import urllib.request as urllib except: import urllib -import optparse, re, sys, codecs, types +import optparse, re, sys, codecs, types, cgi try: from textwrap import wrap except: pass @@ -266,16 +266,25 @@ def close(self): if self.unicode_snob: nbsp = unichr(name2cp('nbsp')) else: - nbsp = u' ' + nbsp = u' ' self.outtext = self.outtext.replace(u' _place_holder;', nbsp) return self.outtext def handle_charref(self, c): - self.o(self.charref(c), 1) + charref = self.charref(c) + if not self.code and not self.pre: + charref = cgi.escape(charref) + self.o(charref, 1) def handle_entityref(self, c): - self.o(self.entityref(c), 1) + entityref = self.entityref(c) + if not self.code and not self.pre and entityref != ' _place_holder;': + entityref = cgi.escape(entityref) + if (self.code or self.pre) and entityref == ' _place_holder;': + #   doesn't work in `` and indented blocks + entityref = unichr(name2cp('nbsp')) + self.o(entityref, 1) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -453,7 +462,10 @@ def handle_tag(self, tag, attrs, start): # handle some font attributes, but leave headers clean self.handle_emphasis(start, tag_style, parent_style) - if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` `` + if tag in ["code", "tt"] and not self.pre: + # TODO: `` `this` `` + self.o('`') + self.code = not self.code if tag == "abbr": if start: self.abbr_title = None diff --git a/test/GoogleDocMassDownload.md b/test/GoogleDocMassDownload.md index bdd82885..b4dcdeab 100644 --- a/test/GoogleDocMassDownload.md +++ b/test/GoogleDocMassDownload.md @@ -13,16 +13,16 @@ text to separate lists 1. now with numbers 2. the prisoner 1. not an _italic number_ - 2. a **bold human** being + 2. a **bold human**  being 3. end **bold** _italic_ ` def func(x):` -` if x < 1:` -` return 'a'` -` return 'b'` +`   if x < 1:` +`     return 'a'` +`   return 'b'` -Some ` fixed width text` here +Some ` fixed width text`  here _` italic fixed width text`_ diff --git a/test/GoogleDocSaved.md b/test/GoogleDocSaved.md index bdd82885..b4dcdeab 100644 --- a/test/GoogleDocSaved.md +++ b/test/GoogleDocSaved.md @@ -13,16 +13,16 @@ text to separate lists 1. now with numbers 2. the prisoner 1. not an _italic number_ - 2. a **bold human** being + 2. a **bold human**  being 3. end **bold** _italic_ ` def func(x):` -` if x < 1:` -` return 'a'` -` return 'b'` +`   if x < 1:` +`     return 'a'` +`   return 'b'` -Some ` fixed width text` here +Some ` fixed width text`  here _` italic fixed width text`_ diff --git a/test/nbsp.html b/test/nbsp.html index 9cab9015..356d9712 100644 --- a/test/nbsp.html +++ b/test/nbsp.html @@ -5,7 +5,7 @@

NBSP handling test #1

-

In this test all NBSPs will be replaced with ordinary spaces (unicode_snob = False).

+

In this test all NBSP entities will be preserved (unicode_snob = False).

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, @@ -17,4 +17,3 @@

NBSP handling test #1

proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

- diff --git a/test/nbsp.md b/test/nbsp.md index 16c9ee38..716fe5d7 100644 --- a/test/nbsp.md +++ b/test/nbsp.md @@ -1,14 +1,14 @@ # NBSP handling test #1 -In this test all NBSPs will be replaced with ordinary spaces (unicode_snob = -False). +In this test all NBSP entities will be preserved (unicode_snob = False). -Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod -tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, -quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo -consequat. +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim +ad minim veniam, quis nostrud exercitation ullamco laboris nisi +ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore -eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt -in culpa qui officia deserunt mollit anim id est laborum. +Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat +non proident, sunt in culpa qui officia deserunt mollit anim id est +laborum. diff --git a/test/normal.html b/test/normal.html index 47ef480e..b8a472ec 100644 --- a/test/normal.html +++ b/test/normal.html @@ -136,5 +136,21 @@

c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#

+ + c:\tmp, \\server\path, \_/, foo\bar, #\#, \\# + +

+ A common entity is &copy;
+ 3 < 6 && "z" > "a" +

+ +

+ foo   bar +

+ +
foo   bar
+ + foo   bar + diff --git a/test/normal.md b/test/normal.md index d63b403a..503f71ba 100644 --- a/test/normal.md +++ b/test/normal.md @@ -52,3 +52,15 @@ not a hr c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\# +`c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#` + +A common entity is &copy; +3 < 6 && "z" > "a" + +foo   bar + + + foo   bar + +`foo   bar` + diff --git a/test/normal_escape_snob.html b/test/normal_escape_snob.html index 0d21867a..da90792b 100644 --- a/test/normal_escape_snob.html +++ b/test/normal_escape_snob.html @@ -133,9 +133,14 @@


- - -

- +

c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#

+ +

+ A common entity is &copy;
+ 3 < 6 && "z" > "a" +

diff --git a/test/normal_escape_snob.md b/test/normal_escape_snob.md index 1260a0a8..fe4c7c9d 100644 --- a/test/normal_escape_snob.md +++ b/test/normal_escape_snob.md @@ -53,3 +53,6 @@ not a hr c:\tmp, \\\server\path, \\\_/, foo\bar, \#\\\#, \\\\\# +A common entity is &copy; +3 < 6 && "z" > "a" + diff --git a/test/run_tests.py b/test/run_tests.py index 7ebfd394..1cecc4df 100644 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -43,7 +43,7 @@ def test_command(fn, *args): cmd += [fn] result = get_baseline(fn) - actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read() + actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read().decode('utf-8') if os.name == 'nt': # Fix the unwanted CR to CRCRLF replacement