forked from wb14123/couplet-dataset
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sina_spider.py
43 lines (36 loc) · 1.56 KB
/
sina_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
import logging
import re
class CoupletSpider(scrapy.Spider):
    """Spider that saves couplets found in Sina blog articles to text files.

    The spider starts from 19 article-list pages, follows every article
    link on them, and for each article writes the couplets it recognises
    to ``output/<article title>.txt`` (UTF-8, first half and second half
    of each couplet on consecutive lines).
    """

    name = 'CoupletSpider'
    start_urls = ['http://blog.sina.com.cn/s/articlelist_1195052695_2_%d.html' % (i) for i in range(1, 20)]

    # Character separating the two halves of a couplet inside a token.
    _MID_MARK = u'〓'
    # Character that must be present on a line for it to be considered;
    # it is removed before the line is tokenised.
    _END_MARK = u'◎'
    # Tokens are separated by runs of zero-width spaces and/or whitespace.
    # Compiled once here instead of on every text line.
    _WORD_SEPARATOR = re.compile(r"[\u200b\s]+", flags=re.UNICODE)
    # Path separators must not leak from a scraped title into the file name.
    _PATH_SEPARATORS = re.compile(r'[/\\]')

    def parse(self, response):
        """Yield a follow-up request for every article link on a list page.

        Each request is handled by :meth:`parse_couplets`.
        """
        for href in response.css('.atc_title a ::attr(href)'):
            yield response.follow(href.extract(), self.parse_couplets)

    def parse_couplets(self, response):
        """Extract the couplets of one article and write them to a file.

        The file is ``output/<title>.txt`` where ``<title>`` is the article
        title with any ``/`` or ``\\`` replaced by ``_``. The file is
        created even when no couplet is found. Pages without a title are
        skipped with a warning.
        """
        title = response.css('.articalTitle h2 ::text').extract_first()
        if title is None:
            # extract_first() returns None when nothing matched; without a
            # title there is no file name to write to.
            logging.warning("No article title found at %s, skipping", response.url)
            return
        lines = response.css('.articalContent::text, .articalContent *::text').extract()
        # A separator in the title would otherwise point into another
        # (usually non-existent) directory.
        filename = self._PATH_SEPARATORS.sub('_', title)
        # The context manager closes the file even if a write fails.
        with open('output/' + filename + '.txt', 'wb') as output_file:
            for up, down in self._extract_couplets(lines):
                output_file.write((up + u'\n').encode('utf8'))
                output_file.write((down + u'\n').encode('utf8'))

    def _extract_couplets(self, lines):
        """Yield ``(up, down)`` pairs parsed from the article's text nodes.

        Only lines containing both the middle and the end mark are
        examined. A token is accepted when the middle mark splits it into
        exactly two non-empty halves of equal length; any other non-blank
        token is reported with a warning and skipped.
        """
        for line in lines:
            if self._MID_MARK not in line or self._END_MARK not in line:
                continue
            line = line.replace(self._END_MARK, '').replace('|', '')
            for word in self._WORD_SEPARATOR.split(line):
                if not word.strip():
                    continue
                couplet = word.split(self._MID_MARK)
                if len(couplet) != 2:
                    logging.warning("Error while process " + word)
                    continue
                up, down = couplet
                # The two halves of a couplet must have the same length.
                if not up or not down or len(up) != len(down):
                    logging.warning("Error while process " + word)
                    continue
                yield up, down