diff --git a/README.md b/README.md
index c5c00de..d2a7bdb 100644
--- a/README.md
+++ b/README.md
@@ -51,3 +51,4 @@ Useful options to tweak (add to the above command via ``-s NAME=value``):
 - ``DOWNLOAD_DELAY`` - set to 0 when crawling local test server
 - ``RUN_HH`` - set to 0 to skip running full headless-horseman scripts
 - ``PREFER_PAGINATION`` - set to 0 to disable pagination handling
+- ``CDR_CRAWLER``, ``CDR_TEAM`` - CDR export metadata constants
diff --git a/undercrawler/items.py b/undercrawler/items.py
index 92a97be..3c85107 100644
--- a/undercrawler/items.py
+++ b/undercrawler/items.py
@@ -1,20 +1,44 @@
 import scrapy
 
 
-class PageItem(scrapy.Item):
-    url = scrapy.Field()
-    text = scrapy.Field()
-    is_page = scrapy.Field()
-    depth = scrapy.Field()
+class CDRItem(scrapy.Item):
 
-    def __repr__(self):
-        return repr({
-            'url': self['url'],
-            'is_page': self['is_page'],
-            'depth': self['depth'],
-        })
+    # (url)-(crawl timestamp), SHA-256 hashed, UPPERCASE (string)
+    _id = scrapy.Field()
+
+    # MIME type (multi (strings))
+    content_type = scrapy.Field()
+
+    # Text label identifying the software used by the crawler (string)
+    crawler = scrapy.Field()
+
+    # Tika/other extraction output (object)
+    # Our stuff here:
+    # forms: forms metadata as extracted by formasaurus
+    # depth: page depth
+    # is_page: this is a page reached by pagination
+    extracted_metadata = scrapy.Field()
+
+    # Tika/other extraction output (string)
+    extracted_text = scrapy.Field()
 
+    # Original source text/html (string)
+    raw_content = scrapy.Field()
 
-class FormItem(scrapy.Item):
+    # Text label identifying the team responsible for the crawler (string)
+    team = scrapy.Field()
+
+    # Timestamp of COLLECTION of data from the web (datetime)
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html#built-in-date-formats
+    timestamp = scrapy.Field()
+
+    # Full URL requested by the crawler (multi (strings))
     url = scrapy.Field()
-    form_type = scrapy.Field()
+
+    # Schema version. This document describes schema version 2.0. (float)
+    version = scrapy.Field()
+
+    def __repr__(self):
+        fields = ['_id', 'url', 'timestamp', 'extracted_metadata']
+        return '<CDRItem: {}>'.format(', '.join(
+            '{}: {}'.format(f, repr(self[f])) for f in fields))
diff --git a/undercrawler/settings.py b/undercrawler/settings.py
index c0688dd..1b4c13e 100644
--- a/undercrawler/settings.py
+++ b/undercrawler/settings.py
@@ -12,6 +12,9 @@
 AUTOLOGIN_URL = 'http://127.0.0.1:8089'
 AUTOLOGIN_ENABLED = True
 
+CDR_CRAWLER = 'scrapy undercrawler'
+CDR_TEAM = 'HG'
+
 PREFER_PAGINATION = True
 
 DOWNLOADER_MIDDLEWARES = {
diff --git a/undercrawler/spiders/base_spider.py b/undercrawler/spiders/base_spider.py
index 85f0e01..3deaef6 100644
--- a/undercrawler/spiders/base_spider.py
+++ b/undercrawler/spiders/base_spider.py
@@ -1,5 +1,7 @@
 import re
 import contextlib
+from datetime import datetime
+import hashlib
 
 import autopager
 import formasaurus
@@ -8,7 +10,7 @@
 from scrapy.utils.url import canonicalize_url
 from scrapy.utils.python import unique
 
-from ..items import PageItem, FormItem
+from ..items import CDRItem
 
 
 class BaseSpider(scrapy.Spider):
@@ -35,17 +37,16 @@ def splash_request(self, url, callback=None, **kwargs):
 
     def parse(self, response):
         url = response.url
-        self.logger.info(url)
-        yield PageItem(
-            url=url,
-            text=response.text,
+        if not self.link_extractor.matches(url):
+            return
+
+        forms = formasaurus.extract_forms(response.text) if response.text \
+            else []
+        yield self.cdr_item(response, dict(
             is_page=response.meta.get('is_page', False),
             depth=response.meta.get('depth', None),
-        )
-        if response.text:
-            for _, meta in formasaurus.extract_forms(response.text):
-                yield FormItem(url=url, form_type=meta['form'])
-                self.logger.info('Found a %s form at %s', meta['form'], url)
+            forms=[meta for _, meta in forms],
+        ))
 
         if self.settings.getbool('PREFER_PAGINATION'):
             # Follow pagination links; pagination is not a subject of
@@ -62,6 +63,25 @@
         for link in self.link_extractor.extract_links(response):
             yield self.splash_request(link.url)
 
+    def cdr_item(self, response, metadata):
+        url = response.url
+        timestamp = int(datetime.utcnow().timestamp() * 1000)
+        return CDRItem(
+            _id=hashlib.sha256('{}-{}'.format(url, timestamp).encode('utf-8'))\
+                .hexdigest().upper(),
+            content_type=response.headers['content-type']\
+                .decode('ascii', 'ignore'),
+            crawler=self.settings.get('CDR_CRAWLER'),
+            extracted_metadata=metadata,
+            extracted_text='\n'.join(
+                response.xpath('//body').xpath('string()').extract()),
+            raw_content=response.text,
+            team=self.settings.get('CDR_TEAM'),
+            timestamp=timestamp,
+            url=url,
+            version=2.0,
+        )
+
     def _normalize_url(self, url):
         if not url.startswith('http'):
             url = 'http://' + url
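
For reference, a minimal standalone sketch of how the new cdr_item helper derives the _id and timestamp fields (the example URL below is hypothetical; the field semantics follow the comments in CDRItem):

import hashlib
from datetime import datetime

url = 'http://example.com/page'  # hypothetical URL, for illustration only
# Collection timestamp in milliseconds since the epoch, as used by cdr_item()
timestamp = int(datetime.utcnow().timestamp() * 1000)
# _id is the SHA-256 of "<url>-<timestamp>", hex-encoded and upper-cased
_id = hashlib.sha256('{}-{}'.format(url, timestamp).encode('utf-8')).hexdigest().upper()
print(_id)  # a 64-character uppercase hex string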