From 06ce8f3a8dadb7d23c7a691ad55126f57051dbdc Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Wed, 16 Mar 2016 14:01:13 +0300
Subject: [PATCH 1/3] Export in CDRv2 format

Also remove export of found forms, and do not save pages from other
domains.
---
 README.md                           |  2 ++
 undercrawler/items.py               | 37 +++++++++++++++++++++--
 undercrawler/settings.py            |  4 +++
 undercrawler/spiders/base_spider.py | 47 +++++++++++++++++++++--------
 4 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index c5c00de..36e9744 100644
--- a/README.md
+++ b/README.md
@@ -51,3 +51,5 @@ Useful options to tweak (add to the above command via ``-s NAME=value``):
 - ``DOWNLOAD_DELAY`` - set to 0 when crawling local test server
 - ``RUN_HH`` - set to 0 to skip running full headless-horesman scripts
 - ``PREFER_PAGINATION`` - set to 0 to disable pagination handling
+- ``CDR_EXPORT`` - set to 0 to disable export in CDR format
+- ``CDR_*`` - CDR export constants
diff --git a/undercrawler/items.py b/undercrawler/items.py
index 92a97be..1d0355c 100644
--- a/undercrawler/items.py
+++ b/undercrawler/items.py
@@ -15,6 +15,39 @@ def __repr__(self):
         })
 
 
-class FormItem(scrapy.Item):
+class CDRItem(scrapy.Item):
+
+    # (url)-(crawl timestamp), SHA-256 hashed, UPPERCASE (string)
+    _id = scrapy.Field()
+
+    # MIME type (multi (strings))
+    content_type = scrapy.Field()
+
+    # Text label identifying the software used by the crawler (string)
+    crawler = scrapy.Field()
+
+    # Tika/other extraction output (object)
+    extracted_metadata = scrapy.Field()
+
+    # Tika/other extraction output (string)
+    extracted_text = scrapy.Field()
+
+    # Original source text/html (string)
+    raw_content = scrapy.Field()
+
+    # Text label identifying the team responsible for the crawler (string)
+    team = scrapy.Field()
+
+    # Timestamp of COLLECTION of data from the web (datetime)
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html#built-in-date-formats
+    timestamp = scrapy.Field()
+
+    # Full URL requested by the crawler (multi (strings))
     url = scrapy.Field()
-    form_type = scrapy.Field()
+
+    # Schema version. This document describes schema version 2.0. (float)
+    version = scrapy.Field()
+
+    def __repr__(self):
+        fields = ['_id', 'url', 'timestamp']
+        return repr({f: self[f] for f in fields})
diff --git a/undercrawler/settings.py b/undercrawler/settings.py
index c0688dd..368c13b 100644
--- a/undercrawler/settings.py
+++ b/undercrawler/settings.py
@@ -12,6 +12,10 @@
 AUTOLOGIN_URL = 'http://127.0.0.1:8089'
 AUTOLOGIN_ENABLED = True
 
+CDR_EXPORT = True
+CDR_CRAWLER = 'scrapy undercrawler'
+CDR_TEAM = 'HG'
+
 PREFER_PAGINATION = True
 
 DOWNLOADER_MIDDLEWARES = {
diff --git a/undercrawler/spiders/base_spider.py b/undercrawler/spiders/base_spider.py
index e325c4d..e5136e3 100644
--- a/undercrawler/spiders/base_spider.py
+++ b/undercrawler/spiders/base_spider.py
@@ -1,14 +1,15 @@
 import re
 import contextlib
+from datetime import datetime
+import hashlib
 
 import autopager
-import formasaurus
 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.utils.url import canonicalize_url
 from scrapy.utils.python import unique
 
-from ..items import PageItem, FormItem
+from ..items import PageItem, CDRItem
 
 
 class BaseSpider(scrapy.Spider):
@@ -35,17 +36,18 @@ def splash_request(self, url, callback=None, **kwargs):
 
     def parse(self, response):
         url = response.url
-        self.logger.info(url)
-        yield PageItem(
-            url=url,
-            text=response.text,
-            is_page=response.meta.get('is_page', False),
-            depth=response.meta.get('depth', None),
-        )
-        if response.text:
-            for _, meta in formasaurus.extract_forms(response.text):
-                yield FormItem(url=url, form_type=meta['form'])
-                self.logger.info('Found a %s form at %s', meta['form'], url)
+        if not self.link_extractor.matches(url):
+            return
+
+        if self.settings.getbool('CDR_EXPORT'):
+            yield self.cdr_item(response)
+        else:
+            yield PageItem(
+                url=url,
+                text=response.text,
+                is_page=response.meta.get('is_page', False),
+                depth=response.meta.get('depth', None),
+            )
 
         if self.settings.getbool('PREFER_PAGINATION'):
             # Follow pagination links; pagination is not a subject of
@@ -62,6 +64,25 @@ def parse(self, response):
         for link in self.link_extractor.extract_links(response):
             yield self.splash_request(link.url)
 
+    def cdr_item(self, response):
+        url = response.url
+        timestamp = int(datetime.utcnow().timestamp() * 1000)
+        return CDRItem(
+            _id=hashlib.sha256('{}-{}'.format(url, timestamp).encode('utf-8'))\
+                .hexdigest().upper(),
+            content_type=response.headers['content-type']\
+                .decode('ascii', 'ignore'),
+            crawler=self.settings.get('CDR_CRAWLER'),
+            extracted_metadata={},
+            extracted_text='\n'.join(
+                response.xpath('//body//text()').extract()),
+            raw_content=response.text,
+            team=self.settings.get('CDR_TEAM'),
+            timestamp=timestamp,
+            url=url,
+            version=2.0,
+        )
+
     def _normalize_url(self, url):
         if not url.startswith('http'):
             url = 'http://' + url

From cb37620508c046be9099a87eee6c109ce1e0bdbd Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Wed, 16 Mar 2016 15:08:44 +0300
Subject: [PATCH 2/3] Extract text using string() xpath selector

Following @kmike's suggestion. This gives cleaner output with fewer extra
newlines.
---
 undercrawler/spiders/base_spider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/undercrawler/spiders/base_spider.py b/undercrawler/spiders/base_spider.py
index e5136e3..2d0bd55 100644
--- a/undercrawler/spiders/base_spider.py
+++ b/undercrawler/spiders/base_spider.py
@@ -75,7 +75,7 @@ def cdr_item(self, response):
             crawler=self.settings.get('CDR_CRAWLER'),
             extracted_metadata={},
             extracted_text='\n'.join(
-                response.xpath('//body//text()').extract()),
+                response.xpath('//body').xpath('string()').extract()),
             raw_content=response.text,
             team=self.settings.get('CDR_TEAM'),
             timestamp=timestamp,

From 5533d1640b2326fa6b77cbe97547520671b10bc2 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Wed, 16 Mar 2016 15:46:29 +0300
Subject: [PATCH 3/3] Always use CDR format, add extracted_metadata

What was previously stored in PageItem and FormItem is now stored in
extracted_metadata: is_page, depth, forms.
---
 README.md                           |  3 +--
 undercrawler/items.py               | 23 +++++++----------------
 undercrawler/settings.py            |  1 -
 undercrawler/spiders/base_spider.py | 23 +++++++++++------------
 4 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 36e9744..d2a7bdb 100644
--- a/README.md
+++ b/README.md
@@ -51,5 +51,4 @@ Useful options to tweak (add to the above command via ``-s NAME=value``):
 - ``DOWNLOAD_DELAY`` - set to 0 when crawling local test server
 - ``RUN_HH`` - set to 0 to skip running full headless-horesman scripts
 - ``PREFER_PAGINATION`` - set to 0 to disable pagination handling
-- ``CDR_EXPORT`` - set to 0 to disable export in CDR format
-- ``CDR_*`` - CDR export constants
+- ``CDR_CRAWLER``, ``CDR_TEAM`` - CDR export metadata constants
diff --git a/undercrawler/items.py b/undercrawler/items.py
index 1d0355c..3c85107 100644
--- a/undercrawler/items.py
+++ b/undercrawler/items.py
@@ -1,20 +1,6 @@
 import scrapy
 
 
-class PageItem(scrapy.Item):
-    url = scrapy.Field()
-    text = scrapy.Field()
-    is_page = scrapy.Field()
-    depth = scrapy.Field()
-
-    def __repr__(self):
-        return repr({
-            'url': self['url'],
-            'is_page': self['is_page'],
-            'depth': self['depth'],
-        })
-
-
 class CDRItem(scrapy.Item):
 
     # (url)-(crawl timestamp), SHA-256 hashed, UPPERCASE (string)
@@ -27,6 +13,10 @@ class CDRItem(scrapy.Item):
     crawler = scrapy.Field()
 
     # Tika/other extraction output (object)
+    # Our stuff here:
+    # forms: forms metadata as extracted by formasaurus
+    # depth: page depth
+    # is_page: this is a page reached by pagination
     extracted_metadata = scrapy.Field()
 
     # Tika/other extraction output (string)
@@ -49,5 +39,6 @@ class CDRItem(scrapy.Item):
     version = scrapy.Field()
 
     def __repr__(self):
-        fields = ['_id', 'url', 'timestamp']
-        return repr({f: self[f] for f in fields})
+        fields = ['_id', 'url', 'timestamp', 'extracted_metadata']
+        return '<CDRItem: {}>'.format(', '.join(
+            '{}: {}'.format(f, repr(self[f])) for f in fields))
diff --git a/undercrawler/settings.py b/undercrawler/settings.py
index 368c13b..1b4c13e 100644
--- a/undercrawler/settings.py
+++ b/undercrawler/settings.py
@@ -12,7 +12,6 @@
 AUTOLOGIN_URL = 'http://127.0.0.1:8089'
 AUTOLOGIN_ENABLED = True
 
-CDR_EXPORT = True
 CDR_CRAWLER = 'scrapy undercrawler'
 CDR_TEAM = 'HG'
 
diff --git a/undercrawler/spiders/base_spider.py b/undercrawler/spiders/base_spider.py
index 2d0bd55..889daf7 100644
--- a/undercrawler/spiders/base_spider.py
+++ b/undercrawler/spiders/base_spider.py
@@ -4,12 +4,13 @@
 import hashlib
 
 import autopager
+import formasaurus
 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.utils.url import canonicalize_url
 from scrapy.utils.python import unique
 
-from ..items import PageItem, CDRItem
+from ..items import CDRItem
 
 
 class BaseSpider(scrapy.Spider):
@@ -39,15 +40,13 @@ def parse(self, response):
         if not self.link_extractor.matches(url):
             return
 
-        if self.settings.getbool('CDR_EXPORT'):
-            yield self.cdr_item(response)
-        else:
-            yield PageItem(
-                url=url,
-                text=response.text,
-                is_page=response.meta.get('is_page', False),
-                depth=response.meta.get('depth', None),
-            )
+        forms = formasaurus.extract_forms(response.text) if response.text \
+            else []
+        yield self.cdr_item(response, dict(
+            is_page=response.meta.get('is_page', False),
+            depth=response.meta.get('depth', None),
+            forms=[meta for _, meta in forms],
+        ))
 
         if self.settings.getbool('PREFER_PAGINATION'):
             # Follow pagination links; pagination is not a subject of
@@ -64,7 +63,7 @@ def parse(self, response):
         for link in self.link_extractor.extract_links(response):
             yield self.splash_request(link.url)
 
-    def cdr_item(self, response):
+    def cdr_item(self, response, metadata):
         url = response.url
         timestamp = int(datetime.utcnow().timestamp() * 1000)
         return CDRItem(
@@ -73,7 +72,7 @@ def cdr_item(self, response):
             content_type=response.headers['content-type']\
                 .decode('ascii', 'ignore'),
             crawler=self.settings.get('CDR_CRAWLER'),
-            extracted_metadata={},
+            extracted_metadata=metadata,
             extracted_text='\n'.join(
                 response.xpath('//body').xpath('string()').extract()),
             raw_content=response.text,
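
For reference, the ``_id`` scheme introduced in PATCH 1/3 is the SHA-256 hex digest, uppercased, of the string ``{url}-{timestamp}``, where the timestamp is the collection time in milliseconds since the epoch. A minimal sketch of how a consumer of the export could recompute it; the helper name ``cdr_id`` and the example values are illustrative, not part of undercrawler:

```python
import hashlib


def cdr_id(url, timestamp_ms):
    # Same recipe as BaseSpider.cdr_item in PATCH 1/3: SHA-256 over
    # "<url>-<timestamp>", hex-encoded and uppercased.
    raw = '{}-{}'.format(url, timestamp_ms).encode('utf-8')
    return hashlib.sha256(raw).hexdigest().upper()


# Example values (made up): a page URL and a millisecond timestamp.
print(cdr_id('http://example.com/page', 1458133273000))
```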
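
The change in PATCH 2/3 replaces per-text-node extraction (``//body//text()``, which yields one string per text node, including whitespace-only nodes between tags) with ``string()`` evaluated on the ``//body`` selection, which flattens each selected element into a single string. A rough comparison using ``parsel``, the selector library behind Scrapy; the sample HTML and the expected values in the comments are illustrative:

```python
from parsel import Selector

html = '<body><p>First\nparagraph</p>\n\n<p>Second <b>bold</b> bit</p></body>'
sel = Selector(text=html)

# One string per text node, including the whitespace-only node between the
# two <p> elements: roughly ['First\nparagraph', '\n\n', 'Second ', 'bold', ' bit']
per_node = sel.xpath('//body//text()').extract()

# string() concatenates the whole subtree of each selected element into one
# string: roughly ['First\nparagraph\n\nSecond bold bit']
flattened = sel.xpath('//body').xpath('string()').extract()

# Joining per_node with '\n' is what produced the extra newlines.
print('\n'.join(per_node))
print('\n'.join(flattened))
```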
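
After PATCH 3/3 every crawled page is emitted as a ``CDRItem``, and the per-page details that used to live in ``PageItem``/``FormItem`` travel in ``extracted_metadata``. A hypothetical item, shown only to illustrate the shape of that field; the URL, timestamp, depth, and form metadata are invented, and the ``_id`` is truncated:

```python
sample_cdr_item = {
    '_id': '0B72A4E63F2B...',  # SHA-256 of '<url>-<timestamp>', uppercased
    'url': 'http://example.com/catalog?page=2',
    'timestamp': 1458135973000,
    'version': 2.0,
    'crawler': 'scrapy undercrawler',
    'team': 'HG',
    'extracted_metadata': {
        'is_page': True,   # reached via a pagination link
        'depth': 2,        # crawl depth from response.meta
        'forms': [{'form': 'search'}],  # formasaurus form metadata
    },
    # content_type, raw_content, and extracted_text omitted here.
}
```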