-
Notifications
You must be signed in to change notification settings - Fork 0
/
zhuaqu.py
100 lines (80 loc) · 3.04 KB
/
zhuaqu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import html.parser as HTMLparser;
import urllib.request
import sys
import time
import json
import random
from bs4 import BeautifulSoup
import re
# Output sink shared by both parsers in this script.
# NOTE(review): the parsers write tab/newline separated UTF-8 text into this
# file even though the path carries an .xlsx extension.
file_object1 = open(
    'C://Users/Administrator/Desktop/zz91.xlsx',
    mode="w",
    encoding="utf-8",
)

# Link targets collected from the landing page by MyHTMLParser.
lo = []
class MyHTMLParser(HTMLparser.HTMLParser):
    """Collect link targets and text from ``b3-item-title`` blocks.

    While the parser is inside a ``<div>`` that carries an attribute whose
    value is ``b3-item-title``, the ``href`` of every ``<a>`` tag is appended
    to the link list and every text node is written to the output stream.
    Any closing ``</div>`` ends the capture.

    Args:
        links: List receiving the collected ``href`` values.  Defaults to the
            module-level ``lo`` list.
        out: Writable text stream receiving the captured text.  Defaults to
            the module-level ``file_object1``.
        **kwargs: Forwarded unchanged to ``HTMLParser.__init__``.
    """

    # True while inside a "b3-item-title" <div>; reset by any </div>.
    a_t = False

    def __init__(self, links=None, out=None, **kwargs):
        super().__init__(**kwargs)
        self._links = lo if links is None else links
        self._out = file_object1 if out is None else out

    def handle_starttag(self, tag, attrs):
        """Start capturing on a title ``<div>``; record anchors while capturing."""
        if tag == "div":
            if any(value == "b3-item-title" for _, value in attrs):
                self.a_t = True
        elif tag == "a" and self.a_t:
            # Look the target up by name: an anchor may have no attributes at
            # all, or list another attribute before href.
            href = dict(attrs).get("href")
            if href:
                self._links.append(href)

    def handle_endtag(self, tag):
        """Stop capturing as soon as any ``<div>`` closes."""
        if tag == "div":
            self.a_t = False

    def handle_data(self, data):
        """Write captured text, skipping empty and bare-newline nodes."""
        if self.a_t and data != "" and data != "\n":
            self._out.write("得到的数据: " + data + "\n")
# Download the zz91.com landing page and run it through MyHTMLParser.
p = MyHTMLParser()
# The context manager closes the HTTP response even if read/decode fails.
with urllib.request.urlopen("http://www.zz91.com/") as response:
    data = response.read().decode('UTF-8')
p.feed(data)
p.close()
class MyHTMLParser1(HTMLparser.HTMLParser):
    """Write heading text and company details from a listing page.

    Text inside a ``<div>`` that carries an attribute whose value is
    ``zi4_top`` or ``zi6_top`` is written to the output stream followed by a
    tab.  For every ``<input>`` identified as ``company_id`` the company-info
    endpoint is queried and the text fragments found between the tags of its
    ``result`` field are written as one tab-separated row.

    Args:
        out: Writable text stream.  Defaults to the module-level
            ``file_object1``.
        fetch: Callable taking a URL and returning the decoded response body.
            Defaults to an HTTP GET decoded as UTF-8.
        **kwargs: Forwarded unchanged to ``HTMLParser.__init__``.
    """

    # True while inside a heading <div>; reset by any </div>.
    a_t = False

    # Endpoint queried once per company id found on a page.
    _COMPANY_INFO_URL = "http://trade.zz91.com/trade/companyinfos.htm?company_id="
    # Attribute values that mark a <div> whose text should be captured.
    _HEADING_MARKERS = ("zi4_top", "zi6_top")
    # Captures the text sitting between a ">" and the next "<".
    _TEXT_BETWEEN_TAGS = re.compile(r">([^<>]*)<+")

    def __init__(self, out=None, fetch=None, **kwargs):
        super().__init__(**kwargs)
        self._out = file_object1 if out is None else out
        self._fetch = self._http_get if fetch is None else fetch

    @staticmethod
    def _http_get(url):
        """Download *url* and return its body decoded as UTF-8."""
        with urllib.request.urlopen(url) as response:
            return response.read().decode('UTF-8')

    @staticmethod
    def _company_id_from(attrs):
        """Return the ``value`` of a ``company_id`` input, else ``None``."""
        # Match by attribute content instead of by position so inputs with
        # fewer attributes, or a different attribute order, cannot raise.
        is_company_field = any(
            value == "company_id" for name, value in attrs if name != "value"
        )
        if not is_company_field:
            return None
        return dict(attrs).get("value")

    def _write_company_info(self, company_id):
        """Fetch the details for *company_id* and write them as one row."""
        payload = self._fetch(self._COMPANY_INFO_URL + company_id)
        # Keep everything from the first "{" up to, but excluding, the final
        # character.  NOTE(review): presumably strips a callback wrapper
        # around the JSON object - confirm against a live response.
        json_text = payload[payload.find("{"):-1]
        json_text = json_text.replace("\\", " ")
        result = json.loads(json_text)["result"]
        print(result)
        fragments = self._TEXT_BETWEEN_TAGS.findall(result)
        print(fragments)
        self._out.write("\n")
        if not fragments:
            return
        # Select the last fragment by position, not by value, so a repeated
        # or empty fragment earlier in the list does not end the row early.
        for fragment in fragments[:-1]:
            if fragment:
                self._out.write(fragment + "\t")
        self._out.write(fragments[-1] + "\n\n")

    def handle_starttag(self, tag, attrs):
        """Start capturing on heading divs; look up company-id inputs."""
        if tag == "div":
            if any(value in self._HEADING_MARKERS for _, value in attrs):
                self.a_t = True
        elif tag == "input":
            company_id = self._company_id_from(attrs)
            if company_id:
                self._write_company_info(company_id)

    def handle_endtag(self, tag):
        """Stop capturing as soon as any ``<div>`` closes."""
        if tag == "div":
            self.a_t = False

    def handle_data(self, data):
        """Write captured text followed by a tab, skipping blank nodes."""
        if self.a_t and data != "" and data != "\n":
            self._out.write(data + "\t")
# Page numbers requested for every collected link, as query-string text.
pages = [str(number) for number in range(1, 11)]
p1 = MyHTMLParser1()
try:
    for link in lo:
        for page in pages:
            page_url = link + "?page=" + page
            print(page_url)
            # Close each HTTP response as soon as its body has been read.
            with urllib.request.urlopen(page_url) as response:
                data1 = response.read().decode('UTF-8')
            p1.feed(data1)
    p1.close()
finally:
    # Close the output file even when a request or parse step fails, so the
    # rows written so far are flushed to disk.
    file_object1.close()