-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawling.py
91 lines (72 loc) · 2.32 KB
/
crawling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: LiuZhi
@time: 2019-01-13 23:16
@contact: [email protected]
@software: PyCharm
"""
from datetime import datetime
from html.parser import HTMLParser
import feedparser
from auto_app import app
from models.core import Post, PostTag, Tag, db
from models.search import Item
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text nodes.

    Feed it HTML with ``feed()``, call ``close()`` to flush anything still
    buffered, then read the accumulated text with ``get_data()``.
    """

    def __init__(self):
        # convert_charrefs=True makes the parser decode character references
        # (e.g. ``&amp;``) before the text reaches handle_data().
        # HTMLParser.__init__ calls reset(), which creates ``self.fed``.
        super().__init__(convert_charrefs=True)

    def reset(self):
        """Reset the parser state and discard any text collected so far."""
        super().reset()
        # Text fragments in document order; cleared here so that a reused
        # instance does not leak text from a previous document.
        self.fed = []

    def handle_data(self, d):
        """Collect one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text fragments joined into one string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    :param html: HTML (or plain text) string; ``None`` and ``''`` are
        treated as empty input.
    :return: the concatenated text nodes, with character references decoded.
    """
    if not html:
        return ''
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text that contains an unterminated ``&``
    # (e.g. "Q&A") in case more input follows; close() forces the parser to
    # process everything that is still buffered so no text is lost.
    s.close()
    return s.get_data()
# Layouts tried, in order, against ``entry.published``. The first two end in
# a literal ``Z`` and produce naive datetimes; the last one carries a numeric
# UTC offset (``%z``) and produces an aware datetime.
_PUBLISHED_FORMATS = (
    '%Y-%m-%dT%H:%M:%S.%fZ',     # e.g. 2019-01-13T23:16:05.123Z
    '%Y-%m-%dT%H:%M:%SZ',        # e.g. 2019-01-13T23:16:05Z
    '%a, %d %b %Y %H:%M:%S %z',  # e.g. Sun, 13 Jan 2019 23:16:05 +0800
)


def _entry_content(entry):
    """Return the best available body text: content, then summary, then title."""
    content = getattr(entry, 'content', None)
    if content:
        value = getattr(content[0], 'value', None)
        if value:
            return value
    summary = getattr(entry, 'summary', None)
    if summary:
        return summary
    return getattr(entry, 'title', None) or ''


def _entry_created_at(entry):
    """Return the entry's publication time as a ``datetime``.

    Raises ``ValueError`` when no publication date can be determined.
    """
    published = getattr(entry, 'published', None)
    if published:
        for fmt in _PUBLISHED_FORMATS:
            try:
                return datetime.strptime(published, fmt)
            except ValueError:
                continue
    # Fall back to the time tuple feedparser derives itself (documented as
    # being in UTC), so an unexpected date layout does not abort the crawl.
    parsed = getattr(entry, 'published_parsed', None)
    if parsed:
        return datetime(*parsed[:6])
    raise ValueError('cannot determine publication date: %r' % (published,))


def _entry_tag_names(entry):
    """Return the entry's tag terms, or ``['other']`` when it has none."""
    names = []
    for tag in getattr(entry, 'tags', None) or ():
        term = getattr(tag, 'term', None) if tag else None
        if term:
            names.append(term)
    return names or ['other']


def fetch(url, author_id=2):
    """Parse the feed at *url* and store each of its entries as a ``Post``.

    :param url: feed location handed to ``feedparser.parse``.
    :param author_id: author id recorded on every stored post.
    :return: list of the posts for which ``Post.create_or_update`` reported
        success (truthy first element of its result).
    :raises ValueError: if an entry has no usable publication date.
    """
    feed = feedparser.parse(url)
    posts = []
    for entry in feed.entries:
        ok, post = Post.create_or_update(
            author_id=author_id,
            title=getattr(entry, 'title', None) or 'other',
            orig_url=entry.link,
            content=strip_tags(_entry_content(entry)),
            created_at=_entry_created_at(entry),
            tags=_entry_tag_names(entry))
        if ok:
            posts.append(post)
    # NOTE: search indexing is disabled here; the original left both
    # ``Item.add(post)`` (per post) and the bulk variant
    # ``Item.bulk_update(posts, op_type='create')`` commented out.
    return posts
def main():
    """Wipe the search index and the blog tables, then re-import all feeds.

    Everything runs inside ``app.test_request_context()``.
    """
    feed_urls = ('https://imys.net/atom.xml', )
    tables = (Post, Tag, PostTag)
    with app.test_request_context():
        # Delete the Elasticsearch index, destroying all of its data
        # (ignore=404 presumably tolerates a missing index), then call
        # Item.init() (presumably recreates it).
        Item._index.delete(ignore=404)
        Item.init()
        # Database changes must go through SQLAlchemy rather than through a
        # direct database connection.
        for table in tables:
            table.query.delete()
        db.session.commit()
        for feed_url in feed_urls:
            fetch(feed_url)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()