-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathV2EX.py
69 lines (56 loc) · 2.27 KB
/
V2EX.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# created by 10412
# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-10-20 20:43:00
# Project: V2EX
from pyspider.libs.base_handler import *
import re
import random
import MySQLdb
class Handler(BaseHandler):
    """pyspider handler that crawls V2EX topics and stores them as questions.

    Crawl flow: home page -> tab pages (``/?tab=``) -> node pages (``/go/``)
    -> topic pages (``/t/``). For every topic page the title and the HTML of
    ``div.topic_content`` are inserted into the ``question`` table of the
    ``wenda`` MySQL database and also returned as the task result.
    """

    crawl_config = {
    }

    def __init__(self):
        # A single connection is opened per handler instance and reused by
        # every insert.
        # NOTE(review): host and credentials are hard-coded; consider moving
        # them to configuration.
        self.db = MySQLdb.connect('localhost', 'root', 'root', 'wenda', charset='utf8')

    def add_question(self, title, content):
        """Insert one row into ``question`` and commit it.

        The row is attributed to a random ``user_id`` between 1 and 10,
        ``created_date`` is set by the database (``now()``) and
        ``comment_count`` starts at 0.

        This is best effort: any exception is printed and the transaction is
        rolled back instead of being propagated to the caller.

        :param title: question title (scraped text).
        :param content: question body (scraped HTML).
        """
        # Scraped values are passed as query parameters so the driver escapes
        # them; they must never be interpolated into the SQL text itself.
        sql = ('insert into question(title, content, user_id, created_date, comment_count) '
               'values (%s, %s, %s, now(), 0)')
        cursor = None
        try:
            cursor = self.db.cursor()
            cursor.execute(sql, (title, content, random.randint(1, 10)))
            print(cursor.lastrowid)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        finally:
            # Release the cursor whether or not the insert succeeded.
            if cursor is not None:
                cursor.close()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point, scheduled every 24 hours: fetch the V2EX home page."""
        self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every tab link (``/?tab=...``) found on the home page."""
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def tab_page(self, response):
        """Follow every node link (``/go/...``) found on a tab page."""
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def board_page(self, response):
        """Queue every topic on a node page, then follow its pagination links."""
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            # Topic links may carry a "#replyN" fragment; drop it so the same
            # topic is always requested under one URL.
            if '#reply' in url:
                url = url.split('#', 1)[0]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract a topic's title and body, store them and return them.

        :returns: dict with the page ``url``, the ``title`` text and the
            ``content`` HTML (an empty string when the topic has no body).
        """
        title = response.doc('h1').text()
        # .html() yields None when the topic has no div.topic_content;
        # normalise that to an empty string. No manual quote escaping is
        # needed because add_question uses a parameterized query.
        content = response.doc('div.topic_content').html() or ''
        self.add_question(title, content)  # insert into the database
        return {
            "url": response.url,
            "title": title,
            "content": content,
        }