Skip to content

Commit 7350dbd

Browse files
authored
Merge pull request #320 from GSA/iis-date-parser
Add another date parser for IIS server listings
2 parents fe13b30 + 0beefa2 commit 7350dbd

File tree

1 file changed

+22
-11
lines changed
  • ckanext/spatial/harvesters

1 file changed

+22
-11
lines changed

ckanext/spatial/harvesters/waf.py

+22-11
Original file line number | Diff line number | Diff line change
@@ -244,17 +244,26 @@ def fetch_stage(self, harvest_object):
244244
,adjacent=False, joinString=' ').setResultsName('date')
245245
)
246246

247-
iis = parse.SkipTo("<br>").suppress() \
248-
+ parse.OneOrMore("<br>").suppress() \
249-
+ parse.Optional(parse.Combine(
250-
parse.Word(parse.alphanums+'/') +
251-
parse.Word(parse.alphanums+':') +
252-
parse.Word(parse.alphas)
253-
, adjacent=False, joinString=' ').setResultsName('date')
254-
) \
255-
+ parse.Word(parse.nums).suppress() \
256-
+ parse.Literal('<A HREF=').suppress() \
257-
+ parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')
247+
iis = parse.SkipTo("<br>").suppress() \
248+
+ parse.OneOrMore("<br>").suppress() \
249+
+ parse.Optional(parse.Combine(
250+
parse.Word(parse.alphanums+'/') +
251+
parse.Word(parse.alphanums+':') +
252+
parse.Word(parse.alphas)
253+
, adjacent=False, joinString=' ').setResultsName('date')
254+
) \
255+
+ parse.Optional(parse.Combine(
256+
parse.Word(parse.alphas+',') +
257+
parse.Word(parse.alphas) +
258+
parse.Word(parse.nums+',') +
259+
parse.Word(parse.nums) +
260+
parse.Word(parse.nums+':') +
261+
parse.Word(parse.alphas)
262+
, adjacent=False, joinString=' ').setResultsName('date')
263+
) \
264+
+ parse.Word(parse.nums).suppress() \
265+
+ parse.Literal('<A HREF=').suppress() \
266+
+ parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')
258267

259268
other = parse.SkipTo(parse.CaselessLiteral("<a href="), include=True).suppress() \
260269
+ parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')
@@ -328,6 +337,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0):
328337
except Exception as e:
329338
raise
330339
date = None
340+
if not date:
341+
log.debug('failed to get date for %s', url)
331342
results.append((urljoin(base_url, record.url), date))
332343

333344
return results

0 commit comments

Comments
 (0)