forked from PengWeihb/toutiaoSpider
toutiaoPage2.py
# -*- coding:utf-8 -*-
import requests
import urllib3
import time
from bs4 import BeautifulSoup
import pymysql
import html
import re
import random
'''
Crawl Toutiao article content with the Python 3 requests library,
using an HTTP proxy so pages can be fetched in batches.
'''
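# The docstring above describes proxy-based batch crawling, but the code
# below hardcodes a single proxy. A minimal sketch of proxy rotation is
# shown here; PROXY_POOL and pickProxy() are hypothetical additions, not
# part of the original script, and the pool would need real addresses.
PROXY_POOL = [
    "http://60.186.255.172:1246",  # the proxy hardcoded further down
    # "http://<host>:<port>",      # further proxies would be listed here
]

def pickProxy():
    # Choose a proxy at random so consecutive requests are spread
    # across the pool instead of always leaving from the same IP.
    return {"http": random.choice(PROXY_POOL)}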
def loadLink(source_url, userId):
    '''Fetch one article page and return its extracted body text.'''
    referer = 'https://www.toutiao.com/i' + userId + '/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
        #'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
        'Host': 'www.toutiao.com',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': referer,
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        #'Cookie': 'tt_webid=6558223951607875079; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16385a3fcb76d9-0a084656577c928-77256752-1fa400-16385a3fcb8440; CNZZDATA1259612802=560349922-1526952779-https%253A%252F%252Fwww.baidu.com%252F%7C1526958179; tt_webid=6558223951607875079; uuid="w:620c77872f314457ac7474a47bc06f4a"; __tasessionId=8chi0s5jf1526959969608',
        'Cookie': 'uuid="w:bfab8d0a69ca4e989faaa722278b8c70f"; UM_distinctid=163252b864a2e1-0d3fc10de-554c162f-1fa400-163252b864b32f; _ga=GA1.2.627472936.1527212834; tt_webid=6549099449222137358; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=lbwjy1jvf1527578479037; tt_webid=6549099449222137358; CNZZDATA1259612802=207427604-1525336204-%7C1527578368'
    }
    proxies = {"http": "http://60.186.255.172:1246"}  # single fixed proxy; see the pickProxy sketch above for rotation
    try:
        # verify=False skips TLS verification, so silence the resulting
        # InsecureRequestWarning before making the request.
        urllib3.disable_warnings()
        body = requests.get(source_url, headers=headers, proxies=proxies,
                            timeout=5, verify=False).text
        time.sleep(0.1)
    except requests.RequestException:
        # Log the failing URL with a timestamp so it can be retried later.
        print('something is wrong!!!')
        error_time = int(time.time())
        with open('error_url.txt', 'a') as e:
            e.write(str(error_time) + '\n')
            e.write(source_url + '\n')
        print(source_url)
        return '[]'
    response = BeautifulSoup(body, 'lxml')
    time.sleep(0.1)
    print(response)  # debug: dump the parsed page
    content = response.find_all('script')
    # The article body is embedded as JS data in the seventh <script> tag,
    # so index 6 must exist before it can be read.
    if len(content) > 6:
        time.sleep(0.1)
        content = str(content[6])
        # Strip the surrounding <script> tag and JS boilerplate by position.
        content = content[28:-12].strip()
        content = content.split('},')
        if len(content) > 2:
            content = content[2].strip()
            # The article body sits between the 'content:' and 'groupId:' keys.
            content = content.split('content:')
            if len(content) >= 2:
                content = content[1]
                content = content.split('groupId:')
                content = content[0].strip()
                content = content[:-1]  # drop the trailing separator
                text = content.replace('div><', '').replace('</div>', '')
                text = html.unescape(text)  # decode HTML entities in the body
                return text
            else:
                return str(content)
        else:
            return str(content)
    else:
        return '[]'
if __name__ == "__main__":
    db = pymysql.connect(host="localhost", port=3306, db="spider",
                         user="root", password="secret", charset="utf8")
    cursor = db.cursor()
    data = ()  # stays empty if the query below fails
    try:
        sql = 'SELECT source_url FROM toutiao'
        cursor.execute(sql)
        data = cursor.fetchall()
    except pymysql.MySQLError:
        db.rollback()
    pattern = re.compile(r'\d+')  # the user id is the first digit run in the URL
    for i in range(len(data)):
        url = data[i][0]
        user_id = re.findall(pattern, url)
        userId = user_id[0]
        print(userId)
        #time.sleep(0.5)
        page = loadLink(url, userId)
        n = str(i + 324001)  # offset the loop index onto the table's id column
        params = [page, n]
        try:
            sql = """UPDATE toutiao SET article_content=%s WHERE id=%s"""
            cursor.execute(sql, params)
            db.commit()
        except pymysql.MySQLError:
            db.rollback()
        time.sleep(0.2)
    db.close()