forked from PengWeihb/toutiaoSpider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_toutiao_comment.py
236 lines (218 loc) · 8.63 KB
/
get_toutiao_comment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# -*- coding:utf-8 -*-
import requests
import json
from urllib.parse import urlencode
import time
import sys
import redis
from threading import Thread
import hashlib
from fake_useragent import UserAgent
import random
'''
今日头条文章评论内容抓取,抓取一级评论及二级回复评论
'''
# Raise the interpreter's recursion depth limit.
# Pagination in this file is implemented with mutual recursion
# (get_comment <-> parse_comment, get_reply_comment <-> parse_reply_comment),
# so each extra page adds stack frames.
# NOTE(review): a limit this high can exhaust the C stack before Python
# raises RecursionError -- confirm it is really needed.
sys.setrecursionlimit(100000)
class Comment(object):
    """Crawler for Toutiao article comments and their second-level replies.

    Work items are popped from the Redis set ``spider_toutiao_comment_id``;
    each item is the ``str()`` of a dict with ``item_id``, ``group_id`` and
    ``save_time`` keys. HTTP proxies are drawn at random from the Redis set
    ``IP``. Parsed comments are POSTed to the storage endpoints.
    """

    def __init__(self):
        # Paging offset for the first-level comment API; reset per work item in run().
        self.offset = 0
        # "IP failure count" per the original comment; not read anywhere in this class.
        self.count = 2
        self.redis_cli = redis.Redis(host='secret', port=6379, db=0, password='secret', charset='utf8', decode_responses=True)

    def _requeue_if_fresh(self, item_id, group_id, save_time):
        """Put the work item back on the Redis queue if it is under 12 hours old.

        Failures while writing to Redis are reported and swallowed so that a
        requeue problem never masks the error that triggered the requeue.
        """
        try:
            if int(time.time()) - save_time < 43200:  # 12 hours, in seconds
                item = {'item_id': item_id, 'group_id': group_id, 'save_time': save_time}
                self.redis_cli.sadd('spider_toutiao_comment_id', str(item))
                print('insert item success!!!')
        except Exception as e:
            print('insert item wrong', e)

    def get_comment(self, item_id, group_id, save_time):
        """Fetch one page of first-level comments and hand it to parse_comment.

        The page is selected by ``self.offset``. The request goes through a
        proxy picked at random from the Redis set ``IP``.

        Args:
            item_id: Article item id sent to the comment API.
            group_id: Article group id sent to the comment API.
            save_time: Unix timestamp of when the item was queued; items
                younger than 12 hours are requeued when the fetch fails or
                returns no data.
        """
        param_data = {
            'offset': self.offset,
            'group_id': group_id,
            'aggr_type': 1,
            'count': 50,
            'item_id': item_id,
            'ts': int(time.time()),
        }
        comment_url = 'http://is-hl.snssdk.com/article/v4/tab_comments/?' + urlencode(param_data)
        headers = {
            'Host':'is-hl.snssdk.com',
            'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 6.0.1; SM-A8000 Build/MMB29M) NewsArticle/7.0.3 cronet/TTNetVersion:a729d5c3',
        }

        ip = self.redis_cli.srandmember('IP')
        print('请求IP:', ip)
        if ip is None:
            print('ip is None')
            time.sleep(10)
            ip = self.redis_cli.srandmember('IP')
            if ip is None:
                # Still no proxy: a request through "http://None" can only
                # fail, so give the item back to the queue and stop here.
                self._requeue_if_fresh(item_id, group_id, save_time)
                return
        proxies = {
            "http": "http://{}".format(ip),
        }

        # Small random delay (0.25s or 0.5s) between requests.
        time.sleep(random.randint(1, 2) / 4)
        try:
            response = requests.get(comment_url, headers=headers, proxies=proxies, timeout=3)
            payload = response.json()  # parse the body once
            total_number = payload['total_number']
            data = payload['data']
        except Exception as e:
            # The request or its payload was unusable: retry the item later
            # and drop the proxy from the pool.
            # NOTE(review): the original indentation was lost; proxy removal
            # is assumed to belong to the failure path only.
            print('comment request is wrong', e)
            self._requeue_if_fresh(item_id, group_id, save_time)
            self.redis_cli.srem("IP", ip)
            return

        time.sleep(random.randint(1, 2) / 32)
        if not data:
            # No comments yet: look again later while the article is fresh.
            self._requeue_if_fresh(item_id, group_id, save_time)
            return

        try:
            self.parse_comment(data, item_id, group_id, total_number)
        except Exception as e:
            # Parsing/storage problems are not the proxy's fault, so the
            # item is requeued but the proxy stays in the pool.
            print('parse comment is wrong', e)
            self._requeue_if_fresh(item_id, group_id, save_time)

    def parse_comment(self,comments,item_id,group_id,total_number):
        """Extract, enrich and store every first-level comment of one page.

        For each comment the replies are fetched (when ``reply_count > 0``)
        and the combined record is POSTed to the storage endpoints. After the
        page is processed, the second page is requested when the article has
        more than 50 comments; paging stops after offset 50.

        Args:
            comments: List of comment wrappers from the API ``data`` field.
            item_id: Article item id, used to build the article URL.
            group_id: Article group id, forwarded when fetching the next page.
            total_number: Total comment count reported by the API.
        """
        for comment in comments:
            # Unix timestamp of the current request
            mt = int(time.time())
            # API signature string
            para = 'hhdfhurew4646' + 'toutiao.com' + str(mt)
            sign = hashlib.md5(para.encode(encoding='UTF-8')).hexdigest()

            detail = comment['comment']
            comment_id = detail['id']
            reply_count = detail['reply_count']

            # Collect second-level replies into reply_list (filled in place).
            reply_list = []
            if reply_count > 0:
                self.get_reply_comment(comment_id, reply_list, 0)

            items = {
                'mt': mt,
                'sign': sign,
                'arc_url': 'http://toutiao.com/item/' + str(item_id),
                'nickname': detail['user_name'],
                'avatar': detail['user_profile_image_url'],
                'content': detail['text'],
                'reply': reply_count,
                'fabulous': detail['digg_count'],
                'comment_time': detail['create_time'],
                'reply_list': json.dumps(reply_list),
            }
            # Store the comment record.
            # NOTE(review): both endpoints read identically here (hosts are
            # redacted) -- confirm they are distinct services, otherwise each
            # comment is stored twice.
            try:
                url = 'http://secret/spider/toutiao_comment'
                requests.post(url, data=items)
                cm_url = 'http://secret/spider/toutiao_comment'
                requests.post(cm_url, data=items)
                print('ok!!!!')
            except Exception as e:
                print('insert db wrong!!!!', e)

        # Paging: only continue when the article has more than one page.
        if total_number <= 50:
            return
        self.offset += 50
        if self.offset > 50:
            # At most two pages (offsets 0 and 50) are fetched per article.
            return
        # Any timestamp more than a day old: keeps the follow-up page from
        # being requeued by the 12-hour freshness check.
        base_time = 1544612400
        self.get_comment(item_id,group_id,base_time)

    def get_reply_comment(self, id, reply_list, offset_reply):
        """Fetch one page of replies to a comment and pass it to parse_reply_comment.

        Args:
            id: Id of the first-level comment whose replies are requested.
            reply_list: List that parsed replies are appended to (mutated).
            offset_reply: Paging offset into the reply list.

        Any error is printed and swallowed, leaving ``reply_list`` with
        whatever was collected so far.
        """
        param_data = {
            'id': id,
            'count': 50,
            'offset': offset_reply
        }
        ua = UserAgent()
        headers = {
            'Host': 'lf-hl.snssdk.com',
            'User-Agent': ua.random,
        }
        time.sleep(random.randint(1, 2) / 32)
        try:
            reply_url = 'http://lf-hl.snssdk.com/2/comment/v3/reply_list/?' + urlencode(param_data)
            response = requests.get(reply_url, headers=headers,timeout=3)
            soup = response.json()
            self.parse_reply_comment(soup,id,reply_list,offset_reply)
        except Exception as e:
            print('reply_comment is wrong', e)

    def parse_reply_comment(self,response,id,reply_list,offset_reply):
        """Convert one page of reply JSON into records appended to reply_list.

        Args:
            response: Decoded JSON of the reply API; reads
                ``response['data']['has_more']`` and ``response['data']['data']``.
            id: Id of the parent comment, forwarded when fetching the next page.
            reply_list: Output list; one dict per reply is appended (mutated).
            offset_reply: Offset of this page; the next page is requested at
                ``offset_reply + 50`` while the API reports ``has_more``.
        """
        # Whether another page follows
        has_more = response['data']['has_more']
        comments = response['data']['data']
        for comment in comments:
            print('-------reply comment---------')
            user = comment['user']
            # Prefer 'name'; fall back to 'screen_name' when it is absent.
            try:
                user_name = user['name']
            except KeyError:
                user_name = user['screen_name']

            # The reply this reply answers, if any (missing or null -> {}).
            try:
                parent = comment['reply_to_comment']
                reply_to_text = parent['text']
                reply_to_user = parent['user_name']
                try:
                    url_img_list = [img['url'] for img in parent['large_image_list']]
                except (KeyError, TypeError):
                    url_img_list = []
                reply_to_comment = {
                    'reply_to_text': reply_to_text,
                    'reply_to_user': reply_to_user,
                    'url_img_list': url_img_list
                }
            except (KeyError, TypeError):
                reply_to_comment = {}

            reply_list.append({
                'nickname': user_name,
                'avatar': user['avatar_url'],
                'content': comment['text'],
                'fabulous': comment['digg_count'],
                'comment_time': comment['create_time'],
                'reply_to_comment': reply_to_comment
            })

        # Fetch the next page; stop on an empty page so a response that keeps
        # claiming has_more cannot recurse without end.
        if has_more and comments:
            offset_reply += 50
            self.get_reply_comment(id,reply_list, offset_reply)

    def run(self):
        """Worker loop: pop queued articles from Redis and crawl their comments.

        Runs forever. Sleeps 10 minutes when the queue is empty. Malformed
        queue items and crawl errors are reported and skipped.
        """
        import ast  # local import: keeps this block self-contained

        while True:
            data = self.redis_cli.spop('spider_toutiao_comment_id')
            if data is None:
                time.sleep(600)
                print('data is None')
                continue
            try:
                # Queue items are str(dict). SECURITY: the original used
                # eval() on this external data; literal_eval only accepts
                # Python literals, so a poisoned queue cannot execute code.
                item = ast.literal_eval(str(data))
                item_id = item['item_id']
                group_id = item['group_id']
                save_time = item['save_time']
            except (ValueError, SyntaxError, KeyError, TypeError) as e:
                print('queue item is wrong', data, e)
                continue
            # Each article starts from its first page; without this reset the
            # offset left by a previous multi-page article would leak over.
            self.offset = 0
            try:
                self.get_comment(item_id,group_id,save_time)
            except Exception as e:
                print('get_comment is wrong', e)
                continue
if __name__ == "__main__":
    # Start five crawler threads, each driving its own Comment instance
    # (and therefore its own Redis client and paging offset).
    for _ in range(5):
        crawler = Comment()
        Thread(target=crawler.run).start()