@@ -244,17 +244,26 @@ def fetch_stage(self, harvest_object):
244
244
,adjacent = False , joinString = ' ' ).setResultsName ('date' )
245
245
)
246
246
247
- iis = parse .SkipTo ("<br>" ).suppress () \
248
- + parse .OneOrMore ("<br>" ).suppress () \
249
- + parse .Optional (parse .Combine (
250
- parse .Word (parse .alphanums + '/' ) +
251
- parse .Word (parse .alphanums + ':' ) +
252
- parse .Word (parse .alphas )
253
- , adjacent = False , joinString = ' ' ).setResultsName ('date' )
254
- ) \
255
- + parse .Word (parse .nums ).suppress () \
256
- + parse .Literal ('<A HREF=' ).suppress () \
257
- + parse .quotedString .setParseAction (parse .removeQuotes ).setResultsName ('url' )
247
+ iis = parse .SkipTo ("<br>" ).suppress () \
248
+ + parse .OneOrMore ("<br>" ).suppress () \
249
+ + parse .Optional (parse .Combine (
250
+ parse .Word (parse .alphanums + '/' ) +
251
+ parse .Word (parse .alphanums + ':' ) +
252
+ parse .Word (parse .alphas )
253
+ , adjacent = False , joinString = ' ' ).setResultsName ('date' )
254
+ ) \
255
+ + parse .Optional (parse .Combine (
256
+ parse .Word (parse .alphas + ',' ) +
257
+ parse .Word (parse .alphas ) +
258
+ parse .Word (parse .nums + ',' ) +
259
+ parse .Word (parse .nums ) +
260
+ parse .Word (parse .nums + ':' ) +
261
+ parse .Word (parse .alphas )
262
+ , adjacent = False , joinString = ' ' ).setResultsName ('date' )
263
+ ) \
264
+ + parse .Word (parse .nums ).suppress () \
265
+ + parse .Literal ('<A HREF=' ).suppress () \
266
+ + parse .quotedString .setParseAction (parse .removeQuotes ).setResultsName ('url' )
258
267
259
268
other = parse .SkipTo (parse .CaselessLiteral ("<a href=" ), include = True ).suppress () \
260
269
+ parse .quotedString .setParseAction (parse .removeQuotes ).setResultsName ('url' )
@@ -328,6 +337,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0):
328
337
except Exception as e :
329
338
raise
330
339
date = None
340
+ if not date :
341
+ log .debug ('failed to get date for %s' , url )
331
342
results .append ((urljoin (base_url , record .url ), date ))
332
343
333
344
return results
0 commit comments