|
28 | 28 |
|
29 | 29 | from htmldate.cli import examine, main, parse_args, process_args
|
30 | 30 | from htmldate.core import compare_reference, examine_date_elements, find_date, search_page, search_pattern, select_candidate, try_ymd_date
|
31 |
| -from htmldate.extractors import DATE_EXPRESSIONS, custom_parse, external_date_parser, extract_partial_url_date, regex_parse |
| 31 | +from htmldate.extractors import DATE_EXPRESSIONS, custom_parse, discard_unwanted, external_date_parser, extract_partial_url_date, regex_parse |
32 | 32 | from htmldate.settings import MIN_DATE, LATEST_POSSIBLE
|
33 | 33 | from htmldate.utils import decode_response, detect_encoding, fetch_url, load_html, is_dubious_html
|
34 | 34 | from htmldate.validators import convert_date, date_validator, get_max_date, get_min_date, output_format_validator
|
@@ -176,7 +176,8 @@ def test_sanity():
|
176 | 176 | assert output_format_validator('ABC') is False
|
177 | 177 | assert output_format_validator(123) is False
|
178 | 178 | #assert output_format_validator('%\xaa') is False
|
179 |
| - |
| 179 | + _, discarded = discard_unwanted(html.fromstring('<html><body><div id="wm-ipp">000</div><div>AAA</div></body></html>')) |
| 180 | + assert len(discarded) == 1 |
180 | 181 |
|
181 | 182 |
|
182 | 183 | def test_no_date():
|
@@ -504,6 +505,11 @@ def test_regex_parse():
|
504 | 505 | assert custom_parse('abcd 2004-2 efgh', OUTPUTFORMAT, True, MIN_DATE, LATEST_POSSIBLE) is not None
|
505 | 506 | assert custom_parse('abcd 2004-2 efgh', OUTPUTFORMAT, True, MIN_DATE, LATEST_POSSIBLE) is not None
|
506 | 507 | assert custom_parse('abcd 32. Januar 2020 efgh', OUTPUTFORMAT, True, MIN_DATE, LATEST_POSSIBLE) is None
|
| 508 | + # plausible-looking dates: 29 February 2008 is a valid leap day, 30 February never exists |
| 509 | + assert custom_parse('February 29 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) == '2008-02-29' |
| 510 | + assert custom_parse('February 30 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) is None |
| 511 | + assert custom_parse('XXTag, den 29. Februar 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) == '2008-02-29' |
| 512 | + assert custom_parse('XXTag, den 30. Februar 2008', OUTPUTFORMAT, False, MIN_DATE, LATEST_POSSIBLE) is None |
507 | 513 | # for Nones caused by newlines and duplicates
|
508 | 514 | assert regex_parse("January 1st, 1998") is not None
|
509 | 515 | assert regex_parse("February 1st, 1998") is not None
|
@@ -633,6 +639,15 @@ def test_external_date_parser():
|
633 | 639 | '''test external date parser'''
|
634 | 640 | assert external_date_parser('Wednesday, January 1st 2020', OUTPUTFORMAT) == '2020-01-01'
|
635 | 641 | assert external_date_parser('Random text with 2020', OUTPUTFORMAT) is None
|
| 642 | + # https://github.com/scrapinghub/dateparser/issues/333 |
| 643 | + assert external_date_parser('1 January 0001', '%d %B %Y') == '01 January 1' |
| 644 | + # https://github.com/scrapinghub/dateparser/issues/406 |
| 645 | + assert external_date_parser('2018-04-12 17:20:03.12345678999a', OUTPUTFORMAT) == '2018-12-04' |
| 646 | + # https://github.com/scrapinghub/dateparser/issues/685 |
| 647 | + assert external_date_parser('12345678912 days', OUTPUTFORMAT) is None |
| 648 | + # https://github.com/scrapinghub/dateparser/issues/680 |
| 649 | + assert external_date_parser('2.2250738585072011e-308', OUTPUTFORMAT) is None |
| 650 | + assert external_date_parser('⁰⁴⁵₀₁₂', OUTPUTFORMAT) is None |
636 | 651 |
|
637 | 652 |
|
638 | 653 | def test_url():
|
|
0 commit comments