Initial commit: a spider to gather training data
lopuhin committed Sep 2, 2016
0 parents commit d761247
Showing 13 changed files with 1,306 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .gitignore
@@ -0,0 +1,12 @@
*.pyc
venv/
build/
dist/
*.egg-info/
.tox
.idea
htmlcov
.coverage
.cache
*.jl.gz
.ipynb_checkpoints/
14 changes: 14 additions & 0 deletions README.rst
@@ -0,0 +1,14 @@
soft404: a classifier for detecting soft 404 pages
==================================================

A "soft" 404 page is a page that is served with 200 status,
but is really a page that says that content is not available.

Getting data
------------

Run the crawler for a while (results will appear in the ``items.jl.gz`` file)::

    cd crawler
    scrapy crawl spider -o gzip:items.jl

40 changes: 40 additions & 0 deletions crawler/exports.py
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import os
import gzip

from zope.interface import Interface, implementer
from w3lib.url import file_uri_to_path
from scrapy.extensions.feedexport import IFeedStorage


@implementer(IFeedStorage)
class GzipFileFeedStorage(object):
"""
Storage which exports data to a gzipped file.
To use it, add
::
FEED_STORAGES = {
'gzip': 'deepdeep.exports.GzipFileFeedStorage',
}
to settings.py and then run scrapy crawl like this::
scrapy crawl foo -o gzip:/path/to/items.jl
The command above will create ``/path/to/items.jl.gz`` file
(.gz extension is added automatically).
Other export formats are also supported, but it is recommended to use .jl.
If a spider is killed then gz archive may be partially broken.
In this case it user should read the broken archive line-by-line and stop
on gzip decoding errors, discarding the tail. It works OK with .jl exports.
"""
COMPRESS_LEVEL = 4

def __init__(self, uri):
self.path = file_uri_to_path(uri) + ".gz"

def open(self, spider):
dirname = os.path.dirname(self.path)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname, exist_ok=True)
return gzip.open(self.path, 'ab', compresslevel=self.COMPRESS_LEVEL)

def store(self, file):
file.close()
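
Since a killed spider can leave the gzip archive truncated, consumers of ``items.jl.gz`` should read it line by line and discard the tail on a decoding error, as the docstring above suggests. A minimal reader sketch under that assumption (the ``load_items`` helper and the default path are illustrative, not part of this commit)::

    import gzip
    import json
    import zlib

    def load_items(path='items.jl.gz'):
        """Return items from a gzipped .jl feed, tolerating a truncated tail."""
        items = []
        try:
            with gzip.open(path, 'rb') as f:
                for line in f:
                    items.append(json.loads(line.decode('utf8')))
        except (EOFError, OSError, zlib.error, ValueError):
            # The archive was cut off mid-write: keep what decoded cleanly
            # and drop the rest.
            pass
        return items
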
2 changes: 2 additions & 0 deletions crawler/scrapy.cfg
@@ -0,0 +1,2 @@
[settings]
default = settings
30 changes: 30 additions & 0 deletions crawler/settings.py
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-

BOT_NAME = 'crawler'

SPIDER_MODULES = ['spiders']
NEWSPIDER_MODULE = 'spiders'

USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')

CONCURRENT_REQUESTS = 64
CONCURRENT_REQUESTS_PER_DOMAIN = 4
DOWNLOAD_TIMEOUT = 15

COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False
RETRY_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

FEED_STORAGES = {
    'gzip': 'exports.GzipFileFeedStorage',
}

LOG_LEVEL = 'INFO'
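
Because ``scrapy.cfg`` points at this settings module, these values (including the custom ``gzip`` feed storage) are picked up automatically when Scrapy is run from the ``crawler/`` directory. A quick sanity check, assuming Scrapy is installed and the working directory is ``crawler/`` (the snippet is illustrative, not part of this commit)::

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings.get('BOT_NAME'))                # expected: crawler
    print(settings.getdict('FEED_STORAGES'))       # expected: {'gzip': 'exports.GzipFileFeedStorage'}
    print(settings.getint('CONCURRENT_REQUESTS'))  # expected: 64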

58 changes: 58 additions & 0 deletions crawler/spiders.py
@@ -0,0 +1,58 @@
import random
from six.moves.urllib.parse import urlsplit, urlunsplit
from string import ascii_lowercase

import scrapy
from scrapy.linkextractors import LinkExtractor


class Spider(scrapy.Spider):
    name = 'spider'
    # Let 404 responses through to parse() so they can be recorded too.
    handle_httpstatus_list = [404]

    def __init__(self):
        self.le = LinkExtractor()
        # Seed the crawl with a list of popular domains, one per line.
        with open('top-1k.txt') as f:
            self.start_urls = ['http://{}'.format(line.strip()) for line in f]

    def parse(self, response):
        if hasattr(response, 'text'):  # skip non-text (e.g. binary) responses
            yield {
                'url': response.url,
                'html': response.text,
                'status': response.status,
                'headers': response.headers.to_unicode_dict(),
            }
            for link in self.le.extract_links(response):
                yield scrapy.Request(link.url)
                if random.random() < 0.1:  # get some 404-s
                    p = urlsplit(link.url)
                    if len(p.path.strip('/')) > 1:
                        new_path = mangle_path(p.path)
                        yield scrapy.Request(urlunsplit(
                            (p.scheme, p.netloc, new_path, p.query, p.fragment)))


def mangle_path(path):
    """
    Insert a random letter into the last path component, so that the
    resulting path most likely does not exist on the target site.

    >>> random.seed(1); mangle_path('/a')
    '/sa'
    >>> random.seed(1); mangle_path('/afas')
    '/asfas'
    >>> random.seed(1); mangle_path('/afas/a/')
    '/afas/sa/'
    >>> random.seed(1); mangle_path('/afas/ab')
    '/afas/sab'
    >>> random.seed(1); mangle_path('/afas/a/ab')
    '/afas/a/sab'
    """
    lead_path, last_path = path.rstrip('/').rsplit('/', 1)
    add_idx = random.randint(0, len(last_path))
    new_last_path = ''.join([
        last_path[:add_idx],
        random.choice(ascii_lowercase),
        last_path[add_idx:]])
    new_path = '/'.join([lead_path, new_last_path])
    if path.endswith('/') and not new_path.endswith('/'):
        new_path += '/'
    return new_path
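
Besides running ``scrapy crawl spider -o gzip:items.jl`` from the command line, the spider can be driven from a script. A sketch, assuming a Scrapy version contemporary to this commit where the ``-o`` option maps to the ``FEED_URI``/``FEED_FORMAT`` settings; the script itself is illustrative and should be run from the ``crawler/`` directory so that ``scrapy.cfg`` and ``top-1k.txt`` are found::

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    # Rough equivalent of `-o gzip:items.jl` for Scrapy releases of this era.
    settings.set('FEED_URI', 'gzip:items.jl')
    settings.set('FEED_FORMAT', 'jsonlines')

    process = CrawlerProcess(settings)
    process.crawl('spider')  # spider name as defined in spiders.py
    process.start()          # blocks until the crawl finishes (Ctrl-C stops it)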
