Always use CDR format, add extracted_metadata
What was previously stored in PageItem and FormItem
is now stored in extracted_metadata: is_page, depth, forms.
lopuhin committed Mar 16, 2016
1 parent cb37620 commit 5533d16
Showing 4 changed files with 19 additions and 31 deletions.
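In CDR terms, the message means the old per-page item fields now live inside the item's ``extracted_metadata`` object. A minimal sketch of the resulting layout (the key names are from this commit; the concrete values, and the exact shape of each form's metadata, are invented for illustration and depend on the formasaurus version):

# Hypothetical extracted_metadata value after this commit.
extracted_metadata = {
    'is_page': True,    # this page was reached by pagination
    'depth': 2,         # crawl depth taken from response.meta
    'forms': [          # one metadata entry per form, as extracted by formasaurus
        {'form': 'login', 'fields': {'user': 'username', 'pass': 'password'}},
    ],
}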
README.md (1 addition, 2 deletions)
@@ -51,5 +51,4 @@ Useful options to tweak (add to the above command via ``-s NAME=value``):
 - ``DOWNLOAD_DELAY`` - set to 0 when crawling local test server
 - ``RUN_HH`` - set to 0 to skip running full headless-horseman scripts
 - ``PREFER_PAGINATION`` - set to 0 to disable pagination handling
-- ``CDR_EXPORT`` - set to 0 to disable export in CDR format
-- ``CDR_*`` - CDR export constants
+- ``CDR_CRAWLER``, ``CDR_TEAM`` - CDR export metadata constants
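For instance, a sketch of pinning these in a Scrapy settings module or a spider's custom_settings instead of ``-s NAME=value`` on the command line (setting names from this README; the values are only illustrative):

# Illustrative overrides; equivalent to -s NAME=value on the command line.
DOWNLOAD_DELAY = 0     # local test server, no politeness delay needed
RUN_HH = 0             # skip the full headless-horseman scripts
PREFER_PAGINATION = 1  # keep pagination handling enabled
CDR_CRAWLER = 'scrapy undercrawler'  # CDR export metadata constants
CDR_TEAM = 'HG'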
undercrawler/items.py (7 additions, 16 deletions)
@@ -1,20 +1,6 @@
 import scrapy
 
 
-class PageItem(scrapy.Item):
-    url = scrapy.Field()
-    text = scrapy.Field()
-    is_page = scrapy.Field()
-    depth = scrapy.Field()
-
-    def __repr__(self):
-        return repr({
-            'url': self['url'],
-            'is_page': self['is_page'],
-            'depth': self['depth'],
-        })
-
-
 class CDRItem(scrapy.Item):
 
     # (url)-(crawl timestamp), SHA-256 hashed, UPPERCASE (string)
@@ -27,6 +13,10 @@ class CDRItem(scrapy.Item):
     crawler = scrapy.Field()
 
     # Tika/other extraction output (object)
+    # Our stuff here:
+    # forms: forms metadata as extracted by formasaurus
+    # depth: page depth
+    # is_page: this is a page reached by pagination
     extracted_metadata = scrapy.Field()
 
     # Tika/other extraction output (string)
@@ -49,5 +39,6 @@ class CDRItem(scrapy.Item):
     version = scrapy.Field()
 
     def __repr__(self):
-        fields = ['_id', 'url', 'timestamp']
-        return repr({f: self[f] for f in fields})
+        fields = ['_id', 'url', 'timestamp', 'extracted_metadata']
+        return '<CDRItem: {}>'.format(', '.join(
+            '{}: {}'.format(f, repr(self[f])) for f in fields))
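A quick sketch of what the new ``__repr__`` produces; all field values here are hypothetical, and ``_id`` is shown truncated (per the field comment above, it is the SHA-256 of ``(url)-(crawl timestamp)``, uppercase):

# Hypothetical item; only the four listed fields appear in the repr.
item = CDRItem(
    _id='9F2A...C41E',
    url='http://example.com/catalog?page=2',
    timestamp=1458123456000,
    extracted_metadata={'is_page': True, 'depth': 2, 'forms': []},
)
print(repr(item))
# <CDRItem: _id: '9F2A...C41E', url: 'http://example.com/catalog?page=2',
#  timestamp: 1458123456000, extracted_metadata: {'is_page': True, ...}>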
undercrawler/settings.py (0 additions, 1 deletion)
@@ -12,7 +12,6 @@
 AUTOLOGIN_URL = 'http://127.0.0.1:8089'
 AUTOLOGIN_ENABLED = True
 
-CDR_EXPORT = True
 CDR_CRAWLER = 'scrapy undercrawler'
 CDR_TEAM = 'HG'
 
undercrawler/spiders/base_spider.py (11 additions, 12 deletions)
@@ -4,12 +4,13 @@
 import hashlib
 
 import autopager
+import formasaurus
 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.utils.url import canonicalize_url
 from scrapy.utils.python import unique
 
-from ..items import PageItem, CDRItem
+from ..items import CDRItem
 
 
 class BaseSpider(scrapy.Spider):
@@ -39,15 +40,13 @@ def parse(self, response):
         if not self.link_extractor.matches(url):
             return
 
-        if self.settings.getbool('CDR_EXPORT'):
-            yield self.cdr_item(response)
-        else:
-            yield PageItem(
-                url=url,
-                text=response.text,
-                is_page=response.meta.get('is_page', False),
-                depth=response.meta.get('depth', None),
-            )
+        forms = formasaurus.extract_forms(response.text) if response.text \
+            else []
+        yield self.cdr_item(response, dict(
+            is_page=response.meta.get('is_page', False),
+            depth=response.meta.get('depth', None),
+            forms=[meta for _, meta in forms],
+        ))
 
         if self.settings.getbool('PREFER_PAGINATION'):
             # Follow pagination links; pagination is not a subject of
@@ -64,7 +63,7 @@ def parse(self, response):
         for link in self.link_extractor.extract_links(response):
             yield self.splash_request(link.url)
 
-    def cdr_item(self, response):
+    def cdr_item(self, response, metadata):
         url = response.url
         timestamp = int(datetime.utcnow().timestamp() * 1000)
         return CDRItem(
@@ -73,7 +72,7 @@ def cdr_item(self, response):
             content_type=response.headers['content-type']\
                 .decode('ascii', 'ignore'),
             crawler=self.settings.get('CDR_CRAWLER'),
-            extracted_metadata={},
+            extracted_metadata=metadata,
             extracted_text='\n'.join(
                 response.xpath('//body').xpath('string()').extract()),
             raw_content=response.text,
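The new form handling in ``parse`` leans on formasaurus; below is a small standalone sketch of that call, assuming (as the list comprehension above implies) that ``formasaurus.extract_forms(html)`` returns ``(form_element, form_metadata)`` pairs:

import formasaurus

html = '''<html><body>
<form action="/login" method="POST">
  <input name="username"/>
  <input type="password" name="password"/>
</form>
</body></html>'''

# Same pattern as in parse(): guard against an empty response body, then
# keep only the metadata half of each (element, metadata) pair.
forms = formasaurus.extract_forms(html) if html else []
form_metadata = [meta for _, meta in forms]
# form_metadata is what the spider stores under extracted_metadata['forms'];
# formasaurus should classify the form above as a login form.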
