forked from PengWeihb/toutiaoSpider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuserRearch.py
104 lines (88 loc) · 3.13 KB
/
userRearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding:utf-8 -*-
import re
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from MySQLdb import *
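'''
Uses selenium + PhantomJS to simulate clicks and continuously collect
information about Toutiao (今日头条) account owners.
This approach mainly captures account owners that are active on the Toutiao
home page, and the crawl can be run per channel/category.
'''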
def user_rearch(style, user_list_1):
    """Poll one Toutiao channel page and store account IDs not seen before.

    Opens ``https://www.toutiao.com/ch/<style>/`` in a PhantomJS browser and
    then, for up to 200000 rounds, parses the rendered page, takes the first
    run of digits in each ``<a class="lbtn source">`` element as a user ID
    and inserts every ID that is not already known into the ``Media`` table
    (columns ``MainUrl``, ``userId``, ``mid``).

    The function relies on two module-level globals that the script entry
    point creates: ``conn`` (a DB cursor) and ``db`` (the DB connection).

    Args:
        style: Channel name appended to the URL, e.g. ``'news_game'``.
        user_list_1: Already-known user IDs; they are compared against the
            scraped IDs, which are strings. The list is only read, never
            modified.

    Returns:
        None.
    """
    url = 'https://www.toutiao.com/ch/' + style + '/'
    print(url)
    print(type(user_list_1))

    # A set gives O(1) "already seen?" checks and, unlike comparing list
    # lengths after a dedupe, stays correct even if the incoming list
    # already contains duplicates.  Copying also leaves the caller's list
    # untouched.
    seen_ids = set(user_list_1)
    id_pattern = re.compile(r'\d+')  # compiled once, not once per link
    insert_sql = """insert into Media(MainUrl,userId,mid) value(%s,%s,%s)"""

    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(1)
        driver.refresh()
        driver.implicitly_wait(1)

        for i in range(200000):
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for link in soup.find_all('a', {'class': 'lbtn source'}):
                print(link)
                match = id_pattern.search(str(link))
                if match is None:
                    # No digits in this anchor, so there is no ID to record.
                    continue
                user_id = match.group(0)

                # Throttle per candidate link, as the original did.
                time.sleep(0.5)

                if user_id in seen_ids:
                    continue
                # Mark as seen even if the insert below fails, so a
                # persistently failing row is not retried on every refresh.
                seen_ids.add(user_id)
                print('good!')

                main_url = 'https://www.toutiao.com/c/user/' + user_id + '/'
                mid = 'None'
                try:
                    conn.execute(insert_sql, (main_url, user_id, mid))
                    db.commit()
                except Exception as exc:
                    # Best effort: undo the failed insert, report it, and
                    # keep crawling.
                    db.rollback()
                    print('insert failed for ' + user_id + ': ' + repr(exc))
                print(len(seen_ids))

            print(str(i) + ' is ok')
            time.sleep(1)
            # Long pause after every 100 rounds.  Round 0 is skipped so the
            # crawler does not stall for 30 minutes right after start-up.
            if i and i % 100 == 0:
                time.sleep(1800)
            driver.refresh()
    finally:
        # Always shut the headless browser down, otherwise the PhantomJS
        # process outlives the script.
        driver.quit()
if __name__ == "__main__":
    # Script entry point: load the user IDs already stored in the Media
    # table, then crawl one channel for new ones.
    #
    # `db` and `conn` are deliberately module-level names: user_rearch()
    # reads them as globals.
    #
    # NOTE(review): the database credentials are hard-coded here; consider
    # moving them to configuration or environment variables.
    db = connect(host="localhost", port=3306, db="Spider", user="root", passwd="123456", charset="utf8")
    try:
        conn = db.cursor()
        try:
            conn.execute('SELECT userId FROM Media')
            rows = conn.fetchall()
            db.commit()
        except Exception:
            # Without the list of known IDs the crawl cannot tell new
            # accounts from old ones, so roll back and surface the real
            # error instead of failing later on an undefined variable.
            db.rollback()
            raise

        user_list = [str(row[0]) for row in rows]
        print(user_list)

        # Order-preserving de-duplication in a single pass.
        seen = set()
        user_list_1 = []
        for user_id in user_list:
            if user_id not in seen:
                seen.add(user_id)
                user_list_1.append(user_id)
        print(len(user_list_1))

        # Available channels include: news_game, news_hot, news_tech,
        # news_entertainment, news_sports, news_car, news_finance,
        # news_travel, and so on.
        style = 'news_game'
        user_rearch(style, user_list_1)
    finally:
        # Release the connection even if the crawl raises.
        db.close()