Skip to content

Commit 6ae2297

Browse files
committed Jul 26, 2018
qiubai
1 parent 71ca013 commit 6ae2297

File tree

1 file changed

+54
-0
lines changed

1 file changed

+54
-0
lines changed
 

‎qiubai_crawer.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
4+
5+
def download_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    A browser-like User-Agent header is sent because the target site
    tends to reject the default ``requests`` user agent.

    :param url: page URL to download
    :param timeout: seconds to wait for the server (new, defaults to 10);
        without it ``requests.get`` can block forever on a stalled host
    :return: decoded response body (``str``)
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    r = requests.get(url, headers=headers, timeout=timeout)
    return r.text
9+
10+
11+
def get_content(html, page):
    """Parse one listing page and append every joke found to qiubai.txt.

    :param html: raw HTML of a qiushibaike text-listing page
    :param page: 1-based page number, embedded in the saved record
    """
    tpl = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find(id='content-left')
    for article in container.find_all('div', class_="article"):
        # Author name and the joke body itself.
        author = article.find('h2').string
        body = article.find('div', class_='content').find('span').get_text()
        # Vote and comment counters live under the stats element.
        stats = article.find('div', class_='stats')
        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
        # Gender/age badge is absent for anonymous posters.
        info = article.find('div', class_='articleGender')
        if info is None:
            gender, age = '', ''
        else:
            badge_classes = info['class']
            if "womenIcon" in badge_classes:
                gender = '女'
            elif "manIcon" in badge_classes:
                gender = '男'
            else:
                gender = ''
            age = info.string
        save_txt(tpl.format(page, author, gender, age, vote, comment, body))
37+
38+
39+
def save_txt(*args):
    """Append each text fragment in *args* to ``qiubai.txt`` (UTF-8).

    The output file is opened once per call rather than once per
    fragment (the original re-opened it inside the loop), which avoids
    needless open/close churn when several fragments are passed.

    :param args: any number of strings to append, in order
    """
    with open('qiubai.txt', 'a', encoding='utf-8') as f:
        f.writelines(args)
43+
44+
45+
def main():
    """Crawl the qiushibaike text section and save every joke."""
    # The site's footer shows 13 pages, so the URLs are built directly.
    # Ideally the page count would be scraped with Beautiful Soup instead
    # of being hard-coded here.
    for page in range(1, 14):
        page_url = 'https://qiushibaike.com/text/page/{}'.format(page)
        get_content(download_page(page_url), page)
52+
53+
# Script entry point: run the crawler only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)
Please sign in to comment.