diff --git a/html2text.py b/html2text.py index 2f650ab5..29fea4d7 100755 --- a/html2text.py +++ b/html2text.py @@ -30,7 +30,7 @@ def has_key(x, y): import urllib.request as urllib except: import urllib -import optparse, re, sys, codecs, types +import optparse, re, sys, codecs, types, cgi try: from textwrap import wrap except: pass @@ -266,16 +266,25 @@ def close(self): if self.unicode_snob: nbsp = unichr(name2cp('nbsp')) else: - nbsp = u' ' + nbsp = u' ' self.outtext = self.outtext.replace(u' _place_holder;', nbsp) return self.outtext def handle_charref(self, c): - self.o(self.charref(c), 1) + charref = self.charref(c) + if not self.code and not self.pre: + charref = cgi.escape(charref) + self.o(charref, 1) def handle_entityref(self, c): - self.o(self.entityref(c), 1) + entityref = self.entityref(c) + if not self.code and not self.pre and entityref != ' _place_holder;': + entityref = cgi.escape(entityref) + if (self.code or self.pre) and entityref == ' _place_holder;': + # doesn't work in `` and indented blocks + entityref = unichr(name2cp('nbsp')) + self.o(entityref, 1) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -453,7 +462,10 @@ def handle_tag(self, tag, attrs, start): # handle some font attributes, but leave headers clean self.handle_emphasis(start, tag_style, parent_style) - if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` `` + if tag in ["code", "tt"] and not self.pre: + # TODO: `` `this` `` + self.o('`') + self.code = not self.code if tag == "abbr": if start: self.abbr_title = None diff --git a/test/GoogleDocMassDownload.md b/test/GoogleDocMassDownload.md index bdd82885..b4dcdeab 100644 --- a/test/GoogleDocMassDownload.md +++ b/test/GoogleDocMassDownload.md @@ -13,16 +13,16 @@ text to separate lists 1. now with numbers 2. the prisoner 1. not an _italic number_ - 2. a **bold human** being + 2. a **bold human** being 3. end **bold** _italic_ ` def func(x):` -` if x < 1:` -` return 'a'` -` return 'b'` +` if x < 1:` +` return 'a'` +` return 'b'` -Some ` fixed width text` here +Some ` fixed width text` here _` italic fixed width text`_ diff --git a/test/GoogleDocSaved.md b/test/GoogleDocSaved.md index bdd82885..b4dcdeab 100644 --- a/test/GoogleDocSaved.md +++ b/test/GoogleDocSaved.md @@ -13,16 +13,16 @@ text to separate lists 1. now with numbers 2. the prisoner 1. not an _italic number_ - 2. a **bold human** being + 2. a **bold human** being 3. end **bold** _italic_ ` def func(x):` -` if x < 1:` -` return 'a'` -` return 'b'` +` if x < 1:` +` return 'a'` +` return 'b'` -Some ` fixed width text` here +Some ` fixed width text` here _` italic fixed width text`_ diff --git a/test/nbsp.html b/test/nbsp.html index 9cab9015..356d9712 100644 --- a/test/nbsp.html +++ b/test/nbsp.html @@ -5,7 +5,7 @@
In this test all NBSPs will be replaced with ordinary spaces (unicode_snob = False).
+In this test all NBSP entities will be preserved (unicode_snob = False).
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, @@ -17,4 +17,3 @@