-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpc.py
106 lines (106 loc) · 4.34 KB
/
pc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import re,urllib,threading
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Event-driven HTML scanner that collects links, images, title and text.

    Results gathered while ``feed()`` runs:

    * ``links``       -- every ``href`` value found on an ``<a>`` tag
    * ``linkcontent`` -- text chunks captured while ``flag`` is 4
    * ``img``         -- every ``src`` value found on an ``<img>`` tag
    * ``title``       -- text chunk captured while ``flag`` is 1
    * ``content``     -- text chunks captured while ``flag`` is 3

    ``flag`` decides what the next text chunk is stored as:
    1 = title, 3 = content, 4 = link text; 0 and 2 store nothing.
    A captured chunk resets ``flag`` to 0.  The value 2 is never consumed
    by ``handle_data``; it stays in place until a later start tag
    overwrites it.

    ``content_flag`` is raised to 1 by a watched ``<ul>``/``<div>`` and
    lowered to 0 again by ``</p>``; while it is 1 a ``<br>`` sets
    ``flag`` to 2.
    """

    # ``class`` attribute values that arm content capture, per tag.
    _UL_CLASSES = ("info", "description")
    _DIV_CLASSES = ("content", "author", "conleft")

    def __init__(self):
        # Explicit base-class call instead of super().
        HTMLParser.__init__(self)
        self.flag = 0          # what the next text chunk is stored as
        self.content_flag = 0  # 1 while inside a watched ul/div region
        self.links = []
        self.title = ""
        self.img = []
        self.content = []
        self.linkcontent = []

    def _arm_content_if_class(self, attrs, names):
        """Arm content capture when a ``class`` attribute equals one of *names*."""
        if any(key == "class" and value in names for key, value in attrs):
            self.flag = 3
            self.content_flag = 1

    def handle_starttag(self, tag, attrs):
        """Update the collected lists and the capture state for a start tag.

        *attrs* is the list of ``(name, value)`` pairs supplied by the parser.
        Tags not listed below (including ``<meta>``) are ignored.
        """
        if tag == "a":
            hrefs = [value for key, value in attrs if key == "href"]
            if hrefs:
                self.links.extend(hrefs)
                self.flag = 4
        elif tag == "ul":
            self._arm_content_if_class(attrs, self._UL_CLASSES)
        elif tag == "div":
            self._arm_content_if_class(attrs, self._DIV_CLASSES)
        elif tag == "title":
            self.flag = 1
        elif tag == "img":
            self.img.extend(value for key, value in attrs if key == "src")
        elif tag == "p":
            # A paragraph right after an armed ul/div cancels the capture.
            if self.flag == 3:
                self.flag = 2
        elif tag == "br":
            if self.content_flag == 1:
                self.flag = 2

    def handle_endtag(self, tag):
        """Leave the watched content region when a paragraph closes."""
        if tag == "p" and self.content_flag == 1:
            self.content_flag = 0

    def handle_data(self, data):
        """Store *data* according to ``flag`` and reset ``flag`` once stored."""
        mode = self.flag
        if mode == 1:
            self.title = data
        elif mode == 3:
            self.content.append(data)
        elif mode == 4:
            self.linkcontent.append(data)
        else:
            # flag 0 or 2: nothing to capture, state stays untouched.
            return
        self.flag = 0
def getimageurl(img):
    """Return the entries of *img* that match the hosted-picture URL pattern.

    A URL is kept when it starts with ``http://``, followed by a 3-character
    host label, a 9-character domain label, ``.com/``, a 6-character path
    segment, ``/`` and at least 8 further word characters.

    img: iterable of ``src`` attribute values; entries may be ``None`` (an
        ``<img src>`` without a value) or empty, and such entries are skipped.

    Returns a new list with the matching URLs in their original order.
    """
    # Raw string so the backslashes reach the regex engine untouched;
    # compiled once per call rather than once per candidate.
    pattern = re.compile(r'^http:\/\/\w{3}\.\w{9}\.com\/\w{6}\/\w{8}.*?')
    # The truthiness test keeps None / "" away from the regex, which would
    # otherwise raise TypeError on None.
    return [src for src in img if src and pattern.match(src)]
# Pages to scan for pictures; main() starts one download thread per entry.
url = ['http://www.qiushibaike.com/imgrank']
def getlist(url):
    """Download every matching picture referenced by the page at *url*.

    The page is fetched and decoded as UTF-8, parsed with MyHTMLParser, and
    its ``<img src>`` values are filtered through getimageurl().  Each
    remaining URL is saved as ``E://images/b<index>.jpg``, where ``<index>``
    is the position of the URL in the filtered list.

    url: address of the HTML page to scan.

    Returns None; the downloaded files are the only result.
    """
    from contextlib import closing

    # closing() releases the HTTP response even if read() or decode() raises.
    with closing(urllib.urlopen(url)) as response:
        page = response.read().decode('utf-8')

    parser = MyHTMLParser()
    parser.feed(page)
    parser.close()

    # NOTE(review): file names depend only on the index, so calls for
    # different pages write to the same b0.jpg, b1.jpg, ... paths.
    for index, image_url in enumerate(getimageurl(parser.img)):
        urllib.urlretrieve(image_url, 'E://images/b%d.jpg' % index)
def main():
    """Run getlist() for every entry of the module-level ``url`` list.

    One thread is created per page; all threads are started before any of
    them is joined, and the function returns once every thread has finished.
    """
    workers = [threading.Thread(target=getlist, args=[page]) for page in url]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Start the downloads only when executed as a script, not when imported.
if __name__ == "__main__":
    main()