Skip to content

Commit 4acd538

Browse files
committed
parsers: more tests & logging
1 parent d339dc8 commit 4acd538

File tree

4 files changed

+21
-5
lines changed

4 files changed

+21
-5
lines changed

.coveragerc

-1
Original file line number | Diff line number | Diff line change
@@ -11,4 +11,3 @@ exclude_lines =
1111
if __name__ == .__main__.:
1212
except .*ImportError.*:
1313
except .*UnicodeDecodeError.*:
14-
except .*urllib3.exceptions.*:

htmldate/core.py

+1
Original file line number | Diff line number | Diff line change
@@ -670,6 +670,7 @@ def find_date(htmlobject, extensive_search=True, original_date=False, outputform
670670
# rare LXML error: no NULL bytes or control characters
671671
except ValueError:
672672
cleaned_html = tree
673+
LOGGER.error('lxml cleaner error')
673674
# robust conversion to string
674675
try:
675676
htmlstring = tostring(cleaned_html, pretty_print=False, encoding='unicode')

htmldate/extractors.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -349,7 +349,7 @@ def custom_parse(string, outputformat, extensive_search, min_date, max_date):
349349
LOGGER.debug('custom parse result: %s', dateobject)
350350
return dateobject.strftime(outputformat)
351351
except ValueError as err:
352-
LOGGER.debug('value error during conversion: %s %s', string, err)
352+
LOGGER.error('value error during conversion: %s %s', string, err)
353353

354354
return None
355355

@@ -360,8 +360,9 @@ def external_date_parser(string, outputformat):
360360
try:
361361
target = EXTERNAL_PARSER.get_date_data(string)['date_obj']
362362
# 2 types of errors possible
363-
except (OverflowError, ValueError):
363+
except (OverflowError, ValueError) as err:
364364
target = None
365+
LOGGER.error('external parser error: %s %s', string, err)
365366
# issue with data type
366367
if target is not None:
367368
return datetime.date.strftime(target, outputformat)

tests/unit_tests.py

+17 -2
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828

2929
from htmldate.cli import examine, main, parse_args, process_args
3030
from htmldate.core import compare_reference, examine_date_elements, find_date, search_page, search_pattern, select_candidate, try_ymd_date
31-
from htmldate.extractors import DATE_EXPRESSIONS, custom_parse, external_date_parser, extract_partial_url_date, regex_parse
31+
from htmldate.extractors import DATE_EXPRESSIONS, custom_parse, discard_unwanted, external_date_parser, extract_partial_url_date, regex_parse
3232
from htmldate.settings import MIN_DATE, LATEST_POSSIBLE
3333
from htmldate.utils import decode_response, detect_encoding, fetch_url, load_html, is_dubious_html
3434
from htmldate.validators import convert_date, date_validator, get_max_date, get_min_date, output_format_validator
@@ -176,7 +176,8 @@ def test_sanity():
176176
assert output_format_validator('ABC') is False
177177
assert output_format_validator(123) is False
178178
#assert output_format_validator('%\xaa') is False
179-
179+
_, discarded = discard_unwanted(html.fromstring('<html><body><div id="wm-ipp">000</div><div>AAA</div></body></html>'))
180+
assert len(discarded) == 1
180181

181182

182183
def test_no_date():
@@ -504,6 +505,11 @@ def test_regex_parse():
504505
assert custom_parse('abcd 2004-2 efgh', OUTPUTFORMAT, True, MIN_DATE, LATEST_POSSIBLE) is not None
505506
assert custom_parse('abcd 2004-2 efgh', OUTPUTFORMAT, True, MIN_DATE, LATEST_POSSIBLE) is not None
506507
assert custom_parse('abcd 32. Januar 2020 efgh', OUTPUTFORMAT, True, MIN_DATE, LATEST_POSSIBLE) is None
508+
# plausible but impossible dates
509+
assert custom_parse('February 29 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) == '2008-02-29'
510+
assert custom_parse('February 30 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) is None
511+
assert custom_parse('XXTag, den 29. Februar 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) == '2008-02-29'
512+
assert custom_parse('XXTag, den 30. Februar 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) is None
507513
#for Nones caused by newlines and duplicates
508514
assert regex_parse("January 1st, 1998") is not None
509515
assert regex_parse("February 1st, 1998") is not None
@@ -633,6 +639,15 @@ def test_external_date_parser():
633639
'''test external date parser'''
634640
assert external_date_parser('Wednesday, January 1st 2020', OUTPUTFORMAT) == '2020-01-01'
635641
assert external_date_parser('Random text with 2020', OUTPUTFORMAT) is None
642+
# https://github.com/scrapinghub/dateparser/issues/333
643+
assert external_date_parser('1 January 0001', '%d %B %Y') == '01 January 1'
644+
# https://github.com/scrapinghub/dateparser/issues/406
645+
assert external_date_parser('2018-04-12 17:20:03.12345678999a', OUTPUTFORMAT) == '2018-12-04'
646+
# https://github.com/scrapinghub/dateparser/issues/685
647+
assert external_date_parser('12345678912 days', OUTPUTFORMAT) is None
648+
# https://github.com/scrapinghub/dateparser/issues/680
649+
assert external_date_parser('2.2250738585072011e-308', OUTPUTFORMAT) is None
650+
assert external_date_parser('⁰⁴⁵₀₁₂', OUTPUTFORMAT) is None
636651

637652

638653
def test_url():

0 commit comments

Comments
 (0)