forked from TeamHG-Memex/soft404
Initial commit: a spider to gather training data
Commit d761247 (0 parents): 13 changed files with 1,306 additions and 0 deletions.
@@ -0,0 +1,12 @@
*.pyc
venv/
build/
dist/
*.egg-info/
.tox
.idea
htmlcov
.coverage
.cache
*.jl.gz
.ipynb_checkpoints/
@@ -0,0 +1,14 @@
soft404: a classifier for detecting soft 404 pages
==================================================

A "soft" 404 page is a page that is served with a 200 status
but actually says that the requested content is not available.

Getting data
------------

Run the crawler for a while (results will appear in the ``items.jl.gz`` file)::

    cd crawler
    scrapy crawl spider -o gzip:items.jl
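Each exported line is one JSON object with the page's ``url``, raw ``html``, HTTP ``status`` and ``headers`` (see the spider below). A rough sketch of loading the gathered data back into Python, assuming the gzipped JSON-lines layout produced above (the ``read_items`` helper name is ours, not part of the commit):

    import gzip
    import json

    def read_items(path='items.jl.gz'):
        # one JSON object per line: url, html, status, headers
        with gzip.open(path, 'rt', encoding='utf8') as f:
            for line in f:
                yield json.loads(line)

    pages = list(read_items())
    n_404 = sum(item['status'] == 404 for item in pages)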
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import os
import gzip

from zope.interface import Interface, implementer
from w3lib.url import file_uri_to_path
from scrapy.extensions.feedexport import IFeedStorage


@implementer(IFeedStorage)
class GzipFileFeedStorage(object):
    """
    Storage which exports data to a gzipped file. To use it, add::

        FEED_STORAGES = {
            'gzip': 'deepdeep.exports.GzipFileFeedStorage',
        }

    to settings.py and then run scrapy crawl like this::

        scrapy crawl foo -o gzip:/path/to/items.jl

    The command above will create a ``/path/to/items.jl.gz`` file
    (the .gz extension is added automatically).

    Other export formats are also supported, but it is recommended to use .jl.
    If a spider is killed, the gz archive may be partially broken.
    In this case the user should read the broken archive line-by-line and stop
    on gzip decoding errors, discarding the tail. It works OK with .jl exports.
    """
    COMPRESS_LEVEL = 4

    def __init__(self, uri):
        self.path = file_uri_to_path(uri) + ".gz"

    def open(self, spider):
        dirname = os.path.dirname(self.path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        return gzip.open(self.path, 'ab', compresslevel=self.COMPRESS_LEVEL)

    def store(self, file):
        file.close()
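The recovery path described in the docstring can be sketched as follows (our own illustration, not part of the commit): read the gzipped JSON-lines file line by line and stop at the first decoding error, keeping everything parsed before the broken tail.

    import gzip
    import json

    def read_maybe_truncated(path):
        # Read a .jl.gz export, discarding a possibly broken tail
        # left behind when the spider was killed mid-write.
        items = []
        try:
            with gzip.open(path, 'rt', encoding='utf8') as f:
                for line in f:
                    items.append(json.loads(line))
        except (EOFError, OSError, ValueError):
            pass  # truncated archive or half-written line: keep what we have
        return items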
@@ -0,0 +1,2 @@
[settings]
default = settings
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-

BOT_NAME = 'crawler'

SPIDER_MODULES = ['spiders']
NEWSPIDER_MODULE = 'spiders'

USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')

CONCURRENT_REQUESTS = 64
CONCURRENT_REQUESTS_PER_DOMAIN = 4
DOWNLOAD_TIMEOUT = 15

COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False
RETRY_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

FEED_STORAGES = {
    'gzip': 'exports.GzipFileFeedStorage',
}

LOG_LEVEL = 'INFO'
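As a quick sanity check (our own sketch, not part of the commit), the dotted path registered under the 'gzip' scheme can be resolved with Scrapy's standard helpers when running from the ``crawler`` directory:

    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()  # reads scrapy.cfg and settings.py
    storage_cls = load_object(settings['FEED_STORAGES']['gzip'])
    print(storage_cls.__name__)  # GzipFileFeedStorage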
@@ -0,0 +1,58 @@
import random
from six.moves.urllib.parse import urlsplit, urlunsplit
from string import ascii_lowercase

import scrapy
from scrapy.linkextractors import LinkExtractor


class Spider(scrapy.Spider):
    name = 'spider'
    handle_httpstatus_list = [404]

    def __init__(self):
        self.le = LinkExtractor()
        with open('top-1k.txt') as f:
            self.start_urls = ['http://{}'.format(line.strip()) for line in f]

    def parse(self, response):
        if hasattr(response, 'text'):
            yield {
                'url': response.url,
                'html': response.text,
                'status': response.status,
                'headers': response.headers.to_unicode_dict(),
            }
        for link in self.le.extract_links(response):
            yield scrapy.Request(link.url)
            if random.random() < 0.1:  # get some 404-s
                p = urlsplit(link.url)
                if len(p.path.strip('/')) > 1:
                    new_path = mangle_path(p.path)
                    yield scrapy.Request(urlunsplit(
                        (p.scheme, p.netloc, new_path, p.query, p.fragment)))


def mangle_path(path):
    """
    >>> random.seed(1); mangle_path('/a')
    '/sa'
    >>> random.seed(1); mangle_path('/afas')
    '/asfas'
    >>> random.seed(1); mangle_path('/afas/a/')
    '/afas/sa/'
    >>> random.seed(1); mangle_path('/afas/ab')
    '/afas/sab'
    >>> random.seed(1); mangle_path('/afas/a/ab')
    '/afas/a/sab'
    """
    lead_path, last_path = path.rstrip('/').rsplit('/', 1)
    add_idx = random.randint(0, len(last_path))
    new_last_path = ''.join([
        last_path[:add_idx],
        random.choice(ascii_lowercase),
        last_path[add_idx:]])
    new_path = '/'.join([lead_path, new_last_path])
    if path.endswith('/') and not new_path.endswith('/'):
        new_path += '/'
    return new_path
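A quick interactive check of mangle_path (assuming the module is importable as ``spiders``, its name under SPIDER_MODULES above): each call inserts a single random lowercase letter into the last path segment, which is how the spider manufactures sibling URLs that are likely to return 404.

    import random
    from spiders import mangle_path  # assumed module name

    random.seed(1)
    for _ in range(3):
        # each line is '/blog/post-1' with one extra lowercase letter inserted
        print(mangle_path('/blog/post-1'))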