# -*- coding: utf-8 -*-
# author: inspurer(月小水长)
# pc_type lenovo
# create_date: 2019/2/27
# file_name: lianjia_crawler.py
# github https://github.com/inspurer
# qq_mail [email protected]
import requests
from concurrent.futures import ThreadPoolExecutor
from pyquery import PyQuery as pq
import json
import threading
import time
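

# Builds the listing-page URLs for a city, e.g.
# https://nj.lianjia.com/ershoufang/pg1/ ... pgN/ (Lianjia only exposes the
# first 100 result pages, with 30 listings per page).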
def get_list_page_url(city):
    start_url = "https://{}.lianjia.com/ershoufang".format(city)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    try:
        response = requests.get(start_url, headers=headers)
        doc = pq(response.text)
        total_num = int(doc(".resultDes .total span").text())
        # 30 listings per page
        total_page = total_num // 30 + 1
        # Only the first 100 pages are accessible
        if total_page > 100:
            total_page = 100
        page_url_list = list()
        for i in range(total_page):
            url = start_url + "/pg" + str(i + 1) + "/"
            page_url_list.append(url)
        return page_url_list
    except Exception:
        print("Failed to get the total listing count, please check the start URL")
        return None


detail_list = list()


# A local proxy pool must be running first.
# Proxy pool repo: https://github.com/Python3WebSpider/ProxyPool
def get_valid_ip():
    url = "http://localhost:5000/get"
    try:
        ip = requests.get(url).text
        return ip
    except Exception:
        print("Please start the proxy pool first")
        return None
def get_detail_page_url(page_url):
    global detail_list
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'https://bj.lianjia.com/ershoufang'
    }
    try:
        response = requests.get(page_url, headers=headers, timeout=3)
        doc = pq(response.text)
        i = 0
        detail_urls = list()
        for item in doc(".sellListContent li").items():
            i += 1
            if i == 31:
                break
            child_item = item(".noresultRecommend")
            detail_url = child_item.attr("href")
            # Skip ad/placeholder entries that carry no detail link
            if not detail_url:
                i -= 1
                continue
            detail_urls.append(detail_url)
        return detail_urls
    except Exception:
        print("Failed to fetch listing page " + page_url)


# Note: this lock is created but never acquired below; appends to detail_list
# rely on list.append being atomic under CPython's GIL.
lock = threading.Lock()
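

# Done-callback for the thread pool: receives the Future of get_detail_page_url,
# fetches every detail page it found, and appends one record per listing.
# On failure it retries once through a proxy from the local proxy pool.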
def detail_page_parser(res):
    global detail_list
    detail_urls = res.result()
    if not detail_urls:
        print("detail url list is empty")
        return None
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'https://bj.lianjia.com/ershoufang'
    }
    for detail_url in detail_urls:
        try:
            response = requests.get(url=detail_url, headers=headers, timeout=3)
            detail_dict = dict()
            doc = pq(response.text)
            unit_price = doc(".unitPriceValue").text()
            # Raises ValueError (handled below) when the page carries no price
            unit_price = unit_price[0:unit_price.index("元")]
            title = doc("h1").text()
            area = doc(".areaName .info a").eq(0).text().strip()
            url = detail_url
            detail_dict["title"] = title
            detail_dict["area"] = area
            detail_dict["price"] = unit_price
            detail_dict["url"] = url
            detail_list.append(detail_dict)
            print(unit_price, title, area)
        except Exception:
            print("Failed to fetch detail page, retrying with a proxy IP")
            ip = get_valid_ip()
            if not ip:
                continue
            proxies = {
                "http": "http://" + ip,
            }
            try:
                response = requests.get(url=detail_url, headers=headers, proxies=proxies)
                detail_dict = dict()
                doc = pq(response.text)
                unit_price = doc(".unitPriceValue").text()
                if "元" in unit_price:
                    unit_price = unit_price[0:unit_price.index("元")]
                title = doc("h1").text()
                area = doc(".areaName .info a").eq(0).text().strip()
                url = detail_url
                # Delisted houses are still crawled but have no price
                if len(unit_price) > 0:
                    detail_dict["title"] = title
                    detail_dict["area"] = area
                    detail_dict["price"] = unit_price
                    detail_dict["url"] = url
                    detail_list.append(detail_dict)
                    print(unit_price, title, area)
            except Exception:
                print("Retry failed...")
def save_data(data, filename):
    with open(filename + ".json", 'w', encoding="utf-8") as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))
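

# Orchestrates one city at a time: build the listing-page URLs, fan the pages
# out over a 30-thread pool, parse detail pages in done-callbacks, then save
# the results to <city>.json.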
def main():
    # Candidate city codes: cq, cs, nj, dl, wh, cc
    city_list = ['nj']
    for city in city_list:
        page_url_list = get_list_page_url(city)
        if not page_url_list:
            continue
        p = ThreadPoolExecutor(30)
        for page_url in page_url_list:
            # The done-callback receives a Future object; calling res.result()
            # inside it yields the list returned by get_detail_page_url.
            p.submit(get_detail_page_url, page_url).add_done_callback(detail_page_parser)
        p.shutdown()
        print(detail_list)
        save_data(detail_list, city)
        detail_list.clear()


if __name__ == '__main__':
    old = time.time()
    main()
    new = time.time()
    delta_time = new - old
    print("Total runtime: {}s".format(delta_time))