-
Notifications
You must be signed in to change notification settings - Fork 0
/
dowload_book.py
41 lines (34 loc) · 1.19 KB
/
dowload_book.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
import urllib2
def extract_links(html):
    """Collect the links listed on an index page.

    Only anchors that sit inside a ``<ul class=l13>`` ... ``</ul>`` section
    are considered; anchors elsewhere in the page are ignored.

    Args:
        html: Markup of the index page.

    Returns:
        A list of ``(href, text)`` tuples in document order; empty when the
        page has no matching section.
    """
    return [
        pair
        # re.S lets a section span several lines of markup.
        for section in re.findall(r'<ul class=l13>.*?</ul>', html, re.S)
        for pair in re.findall(r'<a href=(\S+)[^<>]*>([^<>]*)</a>', section)
    ]
def extract_content(html):
    """Return the text found between the page's two content markers.

    The markers are a pair of HTML comments written as raw byte escapes
    (presumably the GBK-encoded Chinese for "body content start" and
    "body content end" -- confirm against a real page). Everything from the
    first start marker through the last end marker is handed to
    ``html_to_text``.

    Args:
        html: Markup of a chapter page.

    Returns:
        The converted text, or ``''`` when the markers are not found.
    """
    found = re.search('<!--\xd5\xfd\xce\xc4\xc4\xda\xc8\xdd\xbf\xaa\xca\xbc-->.*<!--\xd5\xfd\xce\xc4\xc4\xda\xc8\xdd\xbd\xe1\xca\xf8-->', html, re.S)
    if found is None:
        return ''
    return html_to_text(found.group())
def html_to_text(html):
    """Reduce an HTML fragment to plain text.

    Each single-line ``<p>...</p>`` pair is replaced by its content plus a
    newline, every remaining tag is removed, and the stripped result is
    padded with two newlines on each side.

    Args:
        html: Markup fragment to convert.

    Returns:
        The plain text wrapped in leading and trailing blank lines.
    """
    # No DOTALL here: a paragraph that spans lines only loses its tags below.
    with_breaks = re.sub(r'<p>(.*?)</p>', r'\1\n', html)
    tagless = re.sub(r'<[^<>]*>', '', with_breaks)
    return "\n\n%s\n\n" % tagless.strip()
def url_get(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address passed to ``urllib2.urlopen``.

    Returns:
        Everything ``read()`` yields from the opened response.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or by ``read()``; the
        response is closed before the exception propagates.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Close even when read() fails so the connection is not leaked.
        response.close()
def download_book(urlindex, filename, base_url='http://book.sina.com.cn'):
    """Download every chapter linked from an index page into one text file.

    The index page is fetched and parsed first; then, for each link, the
    link text is written as the chapter title followed by the extracted
    chapter content.

    Args:
        urlindex: URL of the index page that lists the chapters.
        filename: Path of the output file; opened in ``'w'`` mode, so an
            existing file is overwritten.
        base_url: Prefix prepended to each chapter href to form its URL.
            Defaults to the site prefix this script was written for.

    Raises:
        Any exception raised while fetching pages or writing the file; the
        output file is closed before the exception propagates.
    """
    links = extract_links(url_get(urlindex))
    # `with` guarantees the file is closed even if a chapter download fails.
    with open(filename, 'w') as fp:
        for href, title in links:
            fp.write(title)
            fp.write(extract_content(url_get(base_url + href)))
# Usage example: download the book and merge it into a single txt file.
# NOTE(review): the output filename is non-ASCII; under Python 2 the file
# needs a PEP 263 coding declaration -- confirm one is present.
if __name__ == '__main__':
    index_url = 'http://book.sina.com.cn/nzt/novel/lit/wxdfd/index.shtml'
    output_name = '杜拉拉升职记.txt'
    download_book(index_url, output_name)