Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def has_key(x, y):
import urllib.request as urllib
except:
import urllib
import optparse, re, sys, codecs, types
import optparse, re, sys, codecs, types, cgi

try: from textwrap import wrap
except: pass
Expand Down Expand Up @@ -266,16 +266,25 @@ def close(self):
if self.unicode_snob:
nbsp = unichr(name2cp('nbsp'))
else:
nbsp = u' '
nbsp = u'&nbsp;'
self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

return self.outtext

def handle_charref(self, c):
self.o(self.charref(c), 1)
charref = self.charref(c)
if not self.code and not self.pre:
charref = cgi.escape(charref)
self.o(charref, 1)

def handle_entityref(self, c):
self.o(self.entityref(c), 1)
entityref = self.entityref(c)
if not self.code and not self.pre and entityref != '&nbsp_place_holder;':
entityref = cgi.escape(entityref)
if (self.code or self.pre) and entityref == '&nbsp_place_holder;':
# &nbsp; doesn't work in `` and indented blocks
entityref = unichr(name2cp('nbsp'))
self.o(entityref, 1)

def handle_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)
Expand Down Expand Up @@ -453,7 +462,10 @@ def handle_tag(self, tag, attrs, start):
# handle some font attributes, but leave headers clean
self.handle_emphasis(start, tag_style, parent_style)

if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
if tag in ["code", "tt"] and not self.pre:
# TODO: `` `this` ``
self.o('`')
self.code = not self.code
if tag == "abbr":
if start:
self.abbr_title = None
Expand Down
10 changes: 5 additions & 5 deletions test/GoogleDocMassDownload.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@ text to separate lists
1. now with numbers
2. the prisoner
1. not an _italic number_
2. a **bold human** being
2. a **bold human**  being
3. end

**bold**
_italic_

` def func(x):`
` if x < 1:`
` return 'a'`
` return 'b'`
`   if x < 1:`
`     return 'a'`
`   return 'b'`

Some ` fixed width text` here
Some ` fixed width text` &nbsp;here
_` italic fixed width text`_
10 changes: 5 additions & 5 deletions test/GoogleDocSaved.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@ text to separate lists
1. now with numbers
2. the prisoner
1. not an _italic number_
2. a **bold human** being
2. a **bold human** &nbsp;being
3. end

**bold**
_italic_

` def func(x):`
` if x < 1:`
` return 'a'`
` return 'b'`
`   if x < 1:`
`     return 'a'`
`   return 'b'`

Some ` fixed width text` here
Some ` fixed width text` &nbsp;here
_` italic fixed width text`_
3 changes: 1 addition & 2 deletions test/nbsp.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<body>
<h1>NBSP handling test #1</h1>

<p>In this test all NBSPs will be replaced with ordinary spaces (unicode_snob = False).</p>
<p>In this test all NBSP entities will be preserved (unicode_snob = False).</p>

<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do&nbsp;eiusmod
tempor incididunt ut&nbsp;labore et&nbsp;dolore magna aliqua. Ut&nbsp;enim ad&nbsp;minim veniam,
Expand All @@ -17,4 +17,3 @@ <h1>NBSP handling test #1</h1>
proident, sunt in&nbsp;culpa qui officia deserunt mollit anim id&nbsp;est laborum.</p>
</body>
</html>

18 changes: 9 additions & 9 deletions test/nbsp.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# NBSP handling test #1

In this test all NBSPs will be replaced with ordinary spaces (unicode_snob =
False).
In this test all NBSP entities will be preserved (unicode_snob = False).

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat.
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do&nbsp;eiusmod
tempor incididunt ut&nbsp;labore et&nbsp;dolore magna aliqua. Ut&nbsp;enim
ad&nbsp;minim veniam, quis nostrud exercitation ullamco laboris nisi
ut&nbsp;aliquip ex&nbsp;ea commodo consequat.

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore
eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt
in culpa qui officia deserunt mollit anim id est laborum.
Duis aute irure dolor in&nbsp;reprehenderit in&nbsp;voluptate velit esse
cillum dolore eu&nbsp;fugiat nulla pariatur. Excepteur sint occaecat cupidatat
non proident, sunt in&nbsp;culpa qui officia deserunt mollit anim id&nbsp;est
laborum.

16 changes: 16 additions & 0 deletions test/normal.html
Original file line number Diff line number Diff line change
Expand Up @@ -136,5 +136,21 @@ <h1>
<p>
c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#
</p>

<code>c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#</code>

<p>
A common entity is &amp;copy;<br>
3 &lt; 6 &amp;&amp; "z" &#62; "a&quot;
</p>

<p>
foo&nbsp;&nbsp;&nbsp;bar
</p>

<pre>foo&nbsp;&nbsp;&nbsp;bar</pre>

<code>foo&nbsp;&nbsp;&nbsp;bar</code>

</body>
</html>
12 changes: 12 additions & 0 deletions test/normal.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,15 @@ not a hr

c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\#

`c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#`

A common entity is &amp;copy;
3 &lt; 6 &amp;&amp; "z" &gt; "a"

foo&nbsp;&nbsp;&nbsp;bar


foo   bar

`foo   bar`

7 changes: 6 additions & 1 deletion test/normal_escape_snob.html
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,14 @@ <h1>
<br>
- - -
</p>

<p>
c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#
</p>

<p>
A common entity is &amp;copy;<br>
3 &lt; 6 &amp;&amp; "z" &#62; "a&quot;
</p>
</body>
</html>
3 changes: 3 additions & 0 deletions test/normal_escape_snob.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,6 @@ not a hr

c:\tmp, \\\server\path, \\\_/, foo\bar, \#\\\#, \\\\\#

A common entity is &amp;copy;
3 &lt; 6 &amp;&amp; "z" &gt; "a"

2 changes: 1 addition & 1 deletion test/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_command(fn, *args):
cmd += [fn]

result = get_baseline(fn)
actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read()
actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read().decode('utf-8')

if os.name == 'nt':
# Fix the unwanted CR to CRCRLF replacement
Expand Down