toutiaoPage.py
# -*- coding:utf-8 -*-
import html
import re
import time
from urllib import request

import pymysql
import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings()  # suppress urllib3 warning noise globally
'''
Fetch each article's body text from its URL, based on the list/detail
pages that have already been crawled into the database.
'''
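# Third-party dependencies implied by the imports above:
# beautifulsoup4, lxml, pymysql, urllib3.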
def loadLink(source_url, userId):
    proxies = {}  # empty: no proxy; fill in to route requests through one
    try:
        proxy_handler = request.ProxyHandler(proxies)
        opener = request.build_opener(proxy_handler)
        referer = 'https://www.toutiao.com/i' + userId + '/'
        opener.addheaders = [
            ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36'),
            # ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7'),
            ('Cookie', 'tt_webid=6549097213473031687'),
            ('Referer', referer),
            ('Host', 'm.toutiao.com')
        ]
        request.install_opener(opener)
        resp = request.urlopen(source_url, timeout=5)
        body = resp.read().decode('utf-8')
        time.sleep(0.1)
    except Exception:
        # Record failed URLs with a timestamp so they can be retried later.
        print('something is wrong!!!')
        error_time = int(time.time())
        with open('error_url.txt', 'a') as e:
            e.write(str(error_time) + '\n')
            e.write(source_url + '\n')
        print(source_url)
        return '[]'
    response = BeautifulSoup(body, 'lxml')
    time.sleep(0.1)
    # print(response)  # debug: dump the whole parsed page
    try:
        content = response.find_all('script')
    except Exception:
        return '[]'
    if len(content) > 6:
        time.sleep(0.1)
        # The 7th <script> tag holds the page's embedded article data; the
        # offsets and split markers below are specific to Toutiao's markup.
        content = content[6]
        content = str(content)
        content = content[28:-12]  # strip the JS wrapper around the data blob
        content = content.strip()
        content = content.split('},')
        if len(content) > 2:
            content = content[2]
            content = content.strip()
            content = content.split('content:')
            if len(content) >= 2:
                content = content[1]
                content = content.split('groupId:')
                content = content[0].strip()
                content = content[:-1]  # drop the trailing comma
                text = content.replace('div><', '').replace('</div>', '')
                text = html.unescape(text)
                return text
            else:
                return '[]'
        else:
            return '[]'
    else:
        return '[]'
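
# Quick standalone check (hypothetical article URL and id, matching the
# 'https://www.toutiao.com/i<id>/' shape used for the Referer above;
# needs network access):
#   print(loadLink('https://www.toutiao.com/i6549097213473031687/', '6549097213473031687'))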
if __name__ == "__main__":
    db = pymysql.connect(host="secret", port=3306, db="Spider", user="root", password="secret", charset="utf8")
    cursor = db.cursor()
    data = []  # initialized so the loop below is a no-op if the SELECT fails
    try:
        sql = '''SELECT id,source_url,article_content FROM toutiaoPage'''
        cursor.execute(sql)
        data = cursor.fetchall()
        print('ok')
        db.commit()
    except Exception:
        db.rollback()
    for row_id, url, content in data:
        # The first run of digits in the article URL is reused as the id for
        # building the Referer header in loadLink.
        pattern = re.compile(r'\d+')
        user_id = re.findall(pattern, url)
        userId = user_id[0]
        # Disabled branch that skipped placeholder rows marked
        # '["视频"]' (video) and '["图集"]' (gallery):
        '''
        if content == '["视频"]':
            pass
        elif content == '["图集"]':
            pass
        else:
        '''
        # Only fetch rows whose content has not been filled in yet.
        if content == '[]':
            time.sleep(0.1)
            page = loadLink(url, userId)
            params = [page, str(row_id)]
            try:
                sql = """update toutiaoPage set article_content=%s where id=%s"""
                cursor.execute(sql, params)
                db.commit()
            except Exception:
                db.rollback()
            time.sleep(0.2)
    db.close()
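
# Note: the slice offsets ([28:-12]) and split markers ('},', 'content:',
# 'groupId:') in loadLink are tied to Toutiao's page markup at crawl time;
# if the embedded-script layout changes, loadLink falls back to '[]' and the
# row will simply be retried on a later run.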