Initial commit: a spider to gather training data
lopuhin committed Sep 2, 2016
0 parents commit d761247
Showing 13 changed files with 1,306 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .gitignore
@@ -0,0 +1,12 @@
*.pyc
venv/
build/
dist/
*.egg-info/
.tox
.idea
htmlcov
.coverage
.cache
*.jl.gz
.ipynb_checkpoints/
14 changes: 14 additions & 0 deletions README.rst
@@ -0,0 +1,14 @@
soft404: a classifier for detecting soft 404 pages
==================================================

A "soft" 404 page is a page that is served with 200 status,
but is really a page that says that content is not available.

Getting data
------------

Run the crawler for a while (results will appear in the ``items.jl.gz`` file)::

    cd crawler
    scrapy crawl spider -o gzip:items.jl

40 changes: 40 additions & 0 deletions crawler/exports.py
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import os
import gzip

from zope.interface import Interface, implementer
from w3lib.url import file_uri_to_path
from scrapy.extensions.feedexport import IFeedStorage


@implementer(IFeedStorage)
class GzipFileFeedStorage(object):
"""
Storage which exports data to a gzipped file.
To use it, add
::
FEED_STORAGES = {
'gzip': 'deepdeep.exports.GzipFileFeedStorage',
}
to settings.py and then run scrapy crawl like this::
scrapy crawl foo -o gzip:/path/to/items.jl
The command above will create ``/path/to/items.jl.gz`` file
(.gz extension is added automatically).
Other export formats are also supported, but it is recommended to use .jl.
If a spider is killed then gz archive may be partially broken.
In this case it user should read the broken archive line-by-line and stop
on gzip decoding errors, discarding the tail. It works OK with .jl exports.
"""
COMPRESS_LEVEL = 4

def __init__(self, uri):
self.path = file_uri_to_path(uri) + ".gz"

def open(self, spider):
dirname = os.path.dirname(self.path)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname, exist_ok=True)
return gzip.open(self.path, 'ab', compresslevel=self.COMPRESS_LEVEL)

def store(self, file):
file.close()
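
Since a killed spider can leave the gzip archive truncated, consumers of ``items.jl.gz`` should read it line by line and discard the tail on a decoding error, as the docstring above suggests. A minimal reader sketch under that assumption (the ``load_items`` helper and the default path are illustrative, not part of this commit)::

    import gzip
    import json
    import zlib

    def load_items(path='items.jl.gz'):
        """Return items from a gzipped .jl feed, tolerating a truncated tail."""
        items = []
        try:
            with gzip.open(path, 'rb') as f:
                for line in f:
                    items.append(json.loads(line.decode('utf8')))
        except (EOFError, OSError, zlib.error, ValueError):
            # The archive was cut off mid-write: keep what decoded cleanly
            # and drop the rest.
            pass
        return items
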
2 changes: 2 additions & 0 deletions crawler/scrapy.cfg
@@ -0,0 +1,2 @@
[settings]
default = settings
30 changes: 30 additions & 0 deletions crawler/settings.py
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-

BOT_NAME = 'crawler'

SPIDER_MODULES = ['spiders']
NEWSPIDER_MODULE = 'spiders'

USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')

CONCURRENT_REQUESTS = 64
CONCURRENT_REQUESTS_PER_DOMAIN = 4
DOWNLOAD_TIMEOUT = 15

COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False
RETRY_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

FEED_STORAGES = {
    'gzip': 'exports.GzipFileFeedStorage',
}

LOG_LEVEL = 'INFO'
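
Because ``scrapy.cfg`` points at this settings module, these values (including the custom ``gzip`` feed storage) are picked up automatically when Scrapy is run from the ``crawler/`` directory. A quick sanity check, assuming Scrapy is installed and the working directory is ``crawler/`` (the snippet is illustrative, not part of this commit)::

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings.get('BOT_NAME'))                # expected: crawler
    print(settings.getdict('FEED_STORAGES'))       # expected: {'gzip': 'exports.GzipFileFeedStorage'}
    print(settings.getint('CONCURRENT_REQUESTS'))  # expected: 64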

58 changes: 58 additions & 0 deletions crawler/spiders.py
@@ -0,0 +1,58 @@
import random
from six.moves.urllib.parse import urlsplit, urlunsplit
from string import ascii_lowercase

import scrapy
from scrapy.linkextractors import LinkExtractor


class Spider(scrapy.Spider):
    name = 'spider'
    # Let 404 responses through to parse() so they can be recorded too.
    handle_httpstatus_list = [404]

    def __init__(self):
        self.le = LinkExtractor()
        # Seed the crawl with a list of popular domains, one per line.
        with open('top-1k.txt') as f:
            self.start_urls = ['http://{}'.format(line.strip()) for line in f]

    def parse(self, response):
        if hasattr(response, 'text'):  # skip non-text (e.g. binary) responses
            yield {
                'url': response.url,
                'html': response.text,
                'status': response.status,
                'headers': response.headers.to_unicode_dict(),
            }
            for link in self.le.extract_links(response):
                yield scrapy.Request(link.url)
                if random.random() < 0.1:  # get some 404-s
                    p = urlsplit(link.url)
                    if len(p.path.strip('/')) > 1:
                        new_path = mangle_path(p.path)
                        yield scrapy.Request(urlunsplit(
                            (p.scheme, p.netloc, new_path, p.query, p.fragment)))


def mangle_path(path):
    """
    Insert a random letter into the last path component, so that the
    resulting path most likely does not exist on the target site.

    >>> random.seed(1); mangle_path('/a')
    '/sa'
    >>> random.seed(1); mangle_path('/afas')
    '/asfas'
    >>> random.seed(1); mangle_path('/afas/a/')
    '/afas/sa/'
    >>> random.seed(1); mangle_path('/afas/ab')
    '/afas/sab'
    >>> random.seed(1); mangle_path('/afas/a/ab')
    '/afas/a/sab'
    """
    lead_path, last_path = path.rstrip('/').rsplit('/', 1)
    add_idx = random.randint(0, len(last_path))
    new_last_path = ''.join([
        last_path[:add_idx],
        random.choice(ascii_lowercase),
        last_path[add_idx:]])
    new_path = '/'.join([lead_path, new_last_path])
    if path.endswith('/') and not new_path.endswith('/'):
        new_path += '/'
    return new_path
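
Besides running ``scrapy crawl spider -o gzip:items.jl`` from the command line, the spider can be driven from a script. A sketch, assuming a Scrapy version contemporary to this commit where the ``-o`` option maps to the ``FEED_URI``/``FEED_FORMAT`` settings; the script itself is illustrative and should be run from the ``crawler/`` directory so that ``scrapy.cfg`` and ``top-1k.txt`` are found::

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    # Rough equivalent of `-o gzip:items.jl` for Scrapy releases of this era.
    settings.set('FEED_URI', 'gzip:items.jl')
    settings.set('FEED_FORMAT', 'jsonlines')

    process = CrawlerProcess(settings)
    process.crawl('spider')  # spider name as defined in spiders.py
    process.start()          # blocks until the crawl finishes (Ctrl-C stops it)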
