From 987d734aa943f4e08ca34eb81f759955c8ed7bae Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 8 Nov 2023 15:14:02 +0000 Subject: [PATCH 1/6] Starting to integrate latest azure pdf parser and pydantic. --- cli/parse_htmls.py | 6 +- cli/parse_no_content_type.py | 2 +- cli/parse_pdfs.py | 10 +- cli/run_parser.py | 4 +- cli/test/test_run_parser.py | 26 ++-- poetry.lock | 194 +++++++++++++++++++------- pyproject.toml | 4 +- src/html_parser/newsplease.py | 4 +- src/html_parser/readability.py | 2 +- src/html_parser/test/test_parsers.py | 2 +- src/translator/test/test_translate.py | 9 +- src/translator/translate.py | 2 +- 12 files changed, 182 insertions(+), 83 deletions(-) diff --git a/cli/parse_htmls.py b/cli/parse_htmls.py index c232d71..029ff82 100644 --- a/cli/parse_htmls.py +++ b/cli/parse_htmls.py @@ -48,7 +48,7 @@ def copy_input_to_output_html( pdf_data=None, ) - output_path.write_text(blank_output.json(indent=4, ensure_ascii=False)) + output_path.write_text(blank_output.model_dump_json(indent=4)) _LOGGER.info( "Blank html output saved.", @@ -84,7 +84,7 @@ def run_html_parser( if not output_path.exists(): copy_input_to_output_html(task, output_path) - existing_parser_output = ParserOutput.parse_raw(output_path.read_text()) + existing_parser_output = ParserOutput.model_validate_json(output_path.read_text()) # If no parsed html dta exists, assume we've not run before existing_html_data_exists = ( existing_parser_output.html_data is not None @@ -107,7 +107,7 @@ def run_html_parser( parsed_html = html_parser.parse(task).detect_and_set_languages() try: - output_path.write_text(parsed_html.json(indent=4, ensure_ascii=False)) + output_path.write_text(parsed_html.model_dump_json(indent=4)) except cloudpathlib.exceptions.OverwriteNewerCloudError as e: _LOGGER.error( "Attempted write to s3, received OverwriteNewerCloudError and therefore skipping.", diff --git a/cli/parse_no_content_type.py b/cli/parse_no_content_type.py index d242b0f..1e9d25c 100644 --- a/cli/parse_no_content_type.py +++ b/cli/parse_no_content_type.py @@ -43,7 +43,7 @@ def process_documents_with_no_content_type( output_path = output_dir / f"{task.document_id}.json" try: - output_path.write_text(output.json(indent=4, ensure_ascii=False)) + output_path.write_text(output.model_dump_json(indent=4)) except cloudpathlib.exceptions.OverwriteNewerCloudError as e: _LOGGER.error( "Attempted write to s3, received OverwriteNewerCloudError and therefore skipping.", diff --git a/cli/parse_pdfs.py b/cli/parse_pdfs.py index 6cca6f3..06b92bf 100644 --- a/cli/parse_pdfs.py +++ b/cli/parse_pdfs.py @@ -65,7 +65,7 @@ def copy_input_to_output_pdf( ) try: - output_path.write_text(blank_output.json(indent=4, ensure_ascii=False)) + output_path.write_text(blank_output.model_dump_json(indent=4)) _LOGGER.info( "Blank output saved.", extra={ @@ -246,7 +246,7 @@ def add_parser_metadata( def save_api_response( azure_cache_dir: Union[Path, S3Path, None], input_task: ParserInput, - api_response_array: [AnalyzeResult], + api_response_array: List[AnalyzeResult], ) -> None: """Cache the raw api responses as an array of json data.""" if azure_cache_dir: @@ -318,7 +318,7 @@ def parse_file( if not output_path.exists(): copy_input_to_output_pdf(input_task, output_path) - existing_parser_output = ParserOutput.parse_raw(output_path.read_text()) + existing_parser_output = ParserOutput.model_validate_json(output_path.read_text()) # If no parsed pdf data exists, assume we've not run before existing_pdf_data_exists = ( existing_parser_output.pdf_data is not None @@ -426,7 +426,7 @@ def parse_file( ) return None - document: ParserOutput = azure_api_response_to_parser_output( + document = azure_api_response_to_parser_output( parser_input=input_task, md5_sum=calculate_pdf_md5sum(str(pdf_path)), api_response=api_response, @@ -452,7 +452,7 @@ def parse_file( ) try: - output_path.write_text(document.json(indent=4, ensure_ascii=False)) + output_path.write_text(document.model_dump_json(indent=4)) except cloudpathlib.exceptions.OverwriteNewerCloudError as e: _LOGGER.error( "Attempted write to s3, received OverwriteNewerCloudError and " diff --git a/cli/run_parser.py b/cli/run_parser.py index 59c15f8..4a08fa9 100644 --- a/cli/run_parser.py +++ b/cli/run_parser.py @@ -197,8 +197,8 @@ def main( tasks = [] for path in files_to_parse: try: - tasks.append(ParserInput.parse_raw(path.read_text())) - except (pydantic.error_wrappers.ValidationError, KeyError) as e: + tasks.append(ParserInput.model_validate_json(path.read_text())) + except (pydantic.ValidationError, KeyError) as e: _LOGGER.error( "Failed to parse input file.", extra={ diff --git a/cli/test/test_run_parser.py b/cli/test/test_run_parser.py index 7c1d78b..f86c27c 100644 --- a/cli/test/test_run_parser.py +++ b/cli/test/test_run_parser.py @@ -82,7 +82,7 @@ def test_run_parser_local_parallel( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.parse_file(output_file) + parser_output = ParserOutput.model_validate_json(output_file.read_text()) assert isinstance(parser_output, ParserOutput) if parser_output.document_content_type == CONTENT_TYPE_HTML: @@ -124,7 +124,7 @@ def test_run_parser_local_series(test_input_dir) -> None: } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.parse_file(output_file) + parser_output = ParserOutput.model_validate_json(output_file.read_text()) assert isinstance(parser_output, ParserOutput) if parser_output.document_content_type == CONTENT_TYPE_HTML: @@ -166,7 +166,7 @@ def test_run_parser_cache_azure_response_local( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.parse_file(output_file) + parser_output = ParserOutput.model_validate_json(output_file.read_text()) assert isinstance(parser_output, ParserOutput) if parser_output.document_content_type == CONTENT_TYPE_HTML: @@ -296,7 +296,7 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None: with tempfile.TemporaryDirectory() as output_dir: with open(Path(output_dir) / "test_pdf.json", "w") as f: f.write( - ParserOutput.parse_obj( + ParserOutput.model_validate( { "document_id": "test_pdf", "document_metadata": backend_document_json, @@ -324,12 +324,12 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None: }, "html_data": None, } - ).json() + ).model_dump_json() ) with open(Path(output_dir) / "test_html.json", "w") as f: f.write( - ParserOutput.parse_obj( + ParserOutput.model_validate( { "document_id": "test_html", "document_metadata": backend_document_json, @@ -356,7 +356,7 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None: }, "pdf_data": None, } - ).json() + ).model_dump_json() ) runner = CliRunner() @@ -387,10 +387,10 @@ def get_parser_output( """Generate the parser output objects for the tests given input variables.""" return ParserOutput( document_id="sdf", - document_metadata=BackendDocument.parse_obj(document_metadata), + document_metadata=BackendDocument.model_validate(document_metadata), document_name="sdf", document_description="sdf", - document_source_url=source_url, + document_source_url=source_url, # type: ignore document_cdn_object="sdf", document_content_type="text/html", document_md5_sum="sdf", @@ -496,7 +496,7 @@ def test_fail_safely_on_azure_uncaught_exception( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.parse_file(output_file) + parser_output = ParserOutput.model_validate_json(output_file.read_text()) assert isinstance(parser_output, ParserOutput) # Any html data should be parsed successfully as it is not using the azure @@ -548,7 +548,7 @@ def test_fail_safely_on_azure_service_request_error( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.parse_file(output_file) + parser_output = ParserOutput.model_validate_json(output_file.read_text()) assert isinstance(parser_output, ParserOutput) # Any html data should be parsed successfully as it is not using the azure @@ -623,7 +623,7 @@ def test_fail_safely_on_azure_http_response_error( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.parse_file(output_file) + parser_output = ParserOutput.model_validate_json(output_file.read_text()) assert isinstance(parser_output, ParserOutput) # Any html data should be parsed successfully as it is not using the @@ -735,7 +735,7 @@ def test_fail_safely_on_azure_http_response_error_large_doc( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.parse_file(output_file) + parser_output = ParserOutput.model_validate_json(output_file.read_text()) assert isinstance(parser_output, ParserOutput) # Any html data should be parsed successfully as it is not using the diff --git a/poetry.lock b/poetry.lock index d8f1cc3..e2de302 100644 --- a/poetry.lock +++ b/poetry.lock @@ -132,6 +132,17 @@ files = [ [package.dependencies] frozenlist = ">=1.1.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + [[package]] name = "async-timeout" version = "4.0.3" @@ -270,7 +281,7 @@ develop = false [package.dependencies] azure-ai-formrecognizer = "^3.2.1" -cpr-data-access = {git = "https://github.com/climatepolicyradar/data-access.git", tag = "v0.2.5"} +cpr-data-access = {git = "https://github.com/climatepolicyradar/data-access.git", rev = "7d7a3c05439532a1fdf473e12b7071ee918cf31c"} langdetect = "^1.0.9" pypdf = "^3.15.0" requests = "^2.31.0" @@ -278,8 +289,8 @@ requests = "^2.31.0" [package.source] type = "git" url = "https://github.com/climatepolicyradar/azure-pdf-parser.git" -reference = "v0.2.1" -resolved_reference = "08dc579494bd9b8a165ef7c9f5bc449eb654c94e" +reference = "7fe0aef5ba969b41783eaa34bcd51ef16d840204" +resolved_reference = "7fe0aef5ba969b41783eaa34bcd51ef16d840204" [[package]] name = "beautifulsoup4" @@ -698,7 +709,7 @@ files = [ [[package]] name = "cpr-data-access" -version = "0.2.3" +version = "0.2.9" description = "" optional = false python-versions = "^3.9" @@ -712,7 +723,7 @@ datasets = "^2.14.0" deprecation = "^2.1.0" langdetect = "^1.0.9" pandas = "^1.5.3" -pydantic = "^1.10.2" +pydantic = "^2.4.0" pyvespa = "^0.37.1" pyyaml = "^6.0.1" sentence-transformers = "^2.2.2" @@ -722,8 +733,8 @@ tqdm = "^4.64.1" [package.source] type = "git" url = "https://github.com/climatepolicyradar/data-access.git" -reference = "v0.2.5" -resolved_reference = "7a5c134c99941034caa44ff5f60e956be2f18067" +reference = "7d7a3c05439532a1fdf473e12b7071ee918cf31c" +resolved_reference = "7d7a3c05439532a1fdf473e12b7071ee918cf31c" [[package]] name = "cryptography" @@ -2863,55 +2874,140 @@ files = [ [[package]] name = "pydantic" -version = "1.10.13" -description = "Data validation and settings management using python type hints" +version = "2.4.2" +description = "Data validation using Python type hints" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:efff03cc7a4f29d9009d1c96ceb1e7a70a65cfe86e89d34e4a5f2ab1e5693737"}, - {file = "pydantic-1.10.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3ecea2b9d80e5333303eeb77e180b90e95eea8f765d08c3d278cd56b00345d01"}, - {file = "pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1740068fd8e2ef6eb27a20e5651df000978edce6da6803c2bef0bc74540f9548"}, - {file = "pydantic-1.10.13-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84bafe2e60b5e78bc64a2941b4c071a4b7404c5c907f5f5a99b0139781e69ed8"}, - {file = "pydantic-1.10.13-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bc0898c12f8e9c97f6cd44c0ed70d55749eaf783716896960b4ecce2edfd2d69"}, - {file = "pydantic-1.10.13-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:654db58ae399fe6434e55325a2c3e959836bd17a6f6a0b6ca8107ea0571d2e17"}, - {file = "pydantic-1.10.13-cp310-cp310-win_amd64.whl", hash = "sha256:75ac15385a3534d887a99c713aa3da88a30fbd6204a5cd0dc4dab3d770b9bd2f"}, - {file = "pydantic-1.10.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c553f6a156deb868ba38a23cf0df886c63492e9257f60a79c0fd8e7173537653"}, - {file = "pydantic-1.10.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e08865bc6464df8c7d61439ef4439829e3ab62ab1669cddea8dd00cd74b9ffe"}, - {file = "pydantic-1.10.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e31647d85a2013d926ce60b84f9dd5300d44535a9941fe825dc349ae1f760df9"}, - {file = "pydantic-1.10.13-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:210ce042e8f6f7c01168b2d84d4c9eb2b009fe7bf572c2266e235edf14bacd80"}, - {file = "pydantic-1.10.13-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:8ae5dd6b721459bfa30805f4c25880e0dd78fc5b5879f9f7a692196ddcb5a580"}, - {file = "pydantic-1.10.13-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f8e81fc5fb17dae698f52bdd1c4f18b6ca674d7068242b2aff075f588301bbb0"}, - {file = "pydantic-1.10.13-cp311-cp311-win_amd64.whl", hash = "sha256:61d9dce220447fb74f45e73d7ff3b530e25db30192ad8d425166d43c5deb6df0"}, - {file = "pydantic-1.10.13-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4b03e42ec20286f052490423682016fd80fda830d8e4119f8ab13ec7464c0132"}, - {file = "pydantic-1.10.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f59ef915cac80275245824e9d771ee939133be38215555e9dc90c6cb148aaeb5"}, - {file = "pydantic-1.10.13-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a1f9f747851338933942db7af7b6ee8268568ef2ed86c4185c6ef4402e80ba8"}, - {file = "pydantic-1.10.13-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:97cce3ae7341f7620a0ba5ef6cf043975cd9d2b81f3aa5f4ea37928269bc1b87"}, - {file = "pydantic-1.10.13-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:854223752ba81e3abf663d685f105c64150873cc6f5d0c01d3e3220bcff7d36f"}, - {file = "pydantic-1.10.13-cp37-cp37m-win_amd64.whl", hash = "sha256:b97c1fac8c49be29486df85968682b0afa77e1b809aff74b83081cc115e52f33"}, - {file = "pydantic-1.10.13-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c958d053453a1c4b1c2062b05cd42d9d5c8eb67537b8d5a7e3c3032943ecd261"}, - {file = "pydantic-1.10.13-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c5370a7edaac06daee3af1c8b1192e305bc102abcbf2a92374b5bc793818599"}, - {file = "pydantic-1.10.13-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d6f6e7305244bddb4414ba7094ce910560c907bdfa3501e9db1a7fd7eaea127"}, - {file = "pydantic-1.10.13-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3a3c792a58e1622667a2837512099eac62490cdfd63bd407993aaf200a4cf1f"}, - {file = "pydantic-1.10.13-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:c636925f38b8db208e09d344c7aa4f29a86bb9947495dd6b6d376ad10334fb78"}, - {file = "pydantic-1.10.13-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:678bcf5591b63cc917100dc50ab6caebe597ac67e8c9ccb75e698f66038ea953"}, - {file = "pydantic-1.10.13-cp38-cp38-win_amd64.whl", hash = "sha256:6cf25c1a65c27923a17b3da28a0bdb99f62ee04230c931d83e888012851f4e7f"}, - {file = "pydantic-1.10.13-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8ef467901d7a41fa0ca6db9ae3ec0021e3f657ce2c208e98cd511f3161c762c6"}, - {file = "pydantic-1.10.13-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:968ac42970f57b8344ee08837b62f6ee6f53c33f603547a55571c954a4225691"}, - {file = "pydantic-1.10.13-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9849f031cf8a2f0a928fe885e5a04b08006d6d41876b8bbd2fc68a18f9f2e3fd"}, - {file = "pydantic-1.10.13-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:56e3ff861c3b9c6857579de282ce8baabf443f42ffba355bf070770ed63e11e1"}, - {file = "pydantic-1.10.13-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f00790179497767aae6bcdc36355792c79e7bbb20b145ff449700eb076c5f96"}, - {file = "pydantic-1.10.13-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:75b297827b59bc229cac1a23a2f7a4ac0031068e5be0ce385be1462e7e17a35d"}, - {file = "pydantic-1.10.13-cp39-cp39-win_amd64.whl", hash = "sha256:e70ca129d2053fb8b728ee7d1af8e553a928d7e301a311094b8a0501adc8763d"}, - {file = "pydantic-1.10.13-py3-none-any.whl", hash = "sha256:b87326822e71bd5f313e7d3bfdc77ac3247035ac10b0c0618bd99dcf95b1e687"}, - {file = "pydantic-1.10.13.tar.gz", hash = "sha256:32c8b48dcd3b2ac4e78b0ba4af3a2c2eb6048cb75202f0ea7b34feb740efc340"}, + {file = "pydantic-2.4.2-py3-none-any.whl", hash = "sha256:bc3ddf669d234f4220e6e1c4d96b061abe0998185a8d7855c0126782b7abc8c1"}, + {file = "pydantic-2.4.2.tar.gz", hash = "sha256:94f336138093a5d7f426aac732dcfe7ab4eb4da243c88f891d65deb4a2556ee7"}, ] [package.dependencies] -typing-extensions = ">=4.2.0" +annotated-types = ">=0.4.0" +pydantic-core = "2.10.1" +typing-extensions = ">=4.6.1" [package.extras] -dotenv = ["python-dotenv (>=0.10.4)"] -email = ["email-validator (>=1.0.3)"] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.10.1" +description = "" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pydantic_core-2.10.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:d64728ee14e667ba27c66314b7d880b8eeb050e58ffc5fec3b7a109f8cddbd63"}, + {file = "pydantic_core-2.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:48525933fea744a3e7464c19bfede85df4aba79ce90c60b94d8b6e1eddd67096"}, + {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef337945bbd76cce390d1b2496ccf9f90b1c1242a3a7bc242ca4a9fc5993427a"}, + {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1392e0638af203cee360495fd2cfdd6054711f2db5175b6e9c3c461b76f5175"}, + {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0675ba5d22de54d07bccde38997e780044dcfa9a71aac9fd7d4d7a1d2e3e65f7"}, + {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:128552af70a64660f21cb0eb4876cbdadf1a1f9d5de820fed6421fa8de07c893"}, + {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f6e6aed5818c264412ac0598b581a002a9f050cb2637a84979859e70197aa9e"}, + {file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ecaac27da855b8d73f92123e5f03612b04c5632fd0a476e469dfc47cd37d6b2e"}, + {file = "pydantic_core-2.10.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b3c01c2fb081fced3bbb3da78510693dc7121bb893a1f0f5f4b48013201f362e"}, + {file = "pydantic_core-2.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:92f675fefa977625105708492850bcbc1182bfc3e997f8eecb866d1927c98ae6"}, + {file = "pydantic_core-2.10.1-cp310-none-win32.whl", hash = "sha256:420a692b547736a8d8703c39ea935ab5d8f0d2573f8f123b0a294e49a73f214b"}, + {file = "pydantic_core-2.10.1-cp310-none-win_amd64.whl", hash = "sha256:0880e239827b4b5b3e2ce05e6b766a7414e5f5aedc4523be6b68cfbc7f61c5d0"}, + {file = "pydantic_core-2.10.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:073d4a470b195d2b2245d0343569aac7e979d3a0dcce6c7d2af6d8a920ad0bea"}, + {file = "pydantic_core-2.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:600d04a7b342363058b9190d4e929a8e2e715c5682a70cc37d5ded1e0dd370b4"}, + {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39215d809470f4c8d1881758575b2abfb80174a9e8daf8f33b1d4379357e417c"}, + {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eeb3d3d6b399ffe55f9a04e09e635554012f1980696d6b0aca3e6cf42a17a03b"}, + {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a7902bf75779bc12ccfc508bfb7a4c47063f748ea3de87135d433a4cca7a2f"}, + {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3625578b6010c65964d177626fde80cf60d7f2e297d56b925cb5cdeda6e9925a"}, + {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:caa48fc31fc7243e50188197b5f0c4228956f97b954f76da157aae7f67269ae8"}, + {file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:07ec6d7d929ae9c68f716195ce15e745b3e8fa122fc67698ac6498d802ed0fa4"}, + {file = "pydantic_core-2.10.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e6f31a17acede6a8cd1ae2d123ce04d8cca74056c9d456075f4f6f85de055607"}, + {file = "pydantic_core-2.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8f1ebca515a03e5654f88411420fea6380fc841d1bea08effb28184e3d4899f"}, + {file = "pydantic_core-2.10.1-cp311-none-win32.whl", hash = "sha256:6db2eb9654a85ada248afa5a6db5ff1cf0f7b16043a6b070adc4a5be68c716d6"}, + {file = "pydantic_core-2.10.1-cp311-none-win_amd64.whl", hash = "sha256:4a5be350f922430997f240d25f8219f93b0c81e15f7b30b868b2fddfc2d05f27"}, + {file = "pydantic_core-2.10.1-cp311-none-win_arm64.whl", hash = "sha256:5fdb39f67c779b183b0c853cd6b45f7db84b84e0571b3ef1c89cdb1dfc367325"}, + {file = "pydantic_core-2.10.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:b1f22a9ab44de5f082216270552aa54259db20189e68fc12484873d926426921"}, + {file = "pydantic_core-2.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8572cadbf4cfa95fb4187775b5ade2eaa93511f07947b38f4cd67cf10783b118"}, + {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db9a28c063c7c00844ae42a80203eb6d2d6bbb97070cfa00194dff40e6f545ab"}, + {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e2a35baa428181cb2270a15864ec6286822d3576f2ed0f4cd7f0c1708472aff"}, + {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05560ab976012bf40f25d5225a58bfa649bb897b87192a36c6fef1ab132540d7"}, + {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d6495008733c7521a89422d7a68efa0a0122c99a5861f06020ef5b1f51f9ba7c"}, + {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ac492c686defc8e6133e3a2d9eaf5261b3df26b8ae97450c1647286750b901"}, + {file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8282bab177a9a3081fd3d0a0175a07a1e2bfb7fcbbd949519ea0980f8a07144d"}, + {file = "pydantic_core-2.10.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:aafdb89fdeb5fe165043896817eccd6434aee124d5ee9b354f92cd574ba5e78f"}, + {file = "pydantic_core-2.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f6defd966ca3b187ec6c366604e9296f585021d922e666b99c47e78738b5666c"}, + {file = "pydantic_core-2.10.1-cp312-none-win32.whl", hash = "sha256:7c4d1894fe112b0864c1fa75dffa045720a194b227bed12f4be7f6045b25209f"}, + {file = "pydantic_core-2.10.1-cp312-none-win_amd64.whl", hash = "sha256:5994985da903d0b8a08e4935c46ed8daf5be1cf217489e673910951dc533d430"}, + {file = "pydantic_core-2.10.1-cp312-none-win_arm64.whl", hash = "sha256:0d8a8adef23d86d8eceed3e32e9cca8879c7481c183f84ed1a8edc7df073af94"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:9badf8d45171d92387410b04639d73811b785b5161ecadabf056ea14d62d4ede"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:ebedb45b9feb7258fac0a268a3f6bec0a2ea4d9558f3d6f813f02ff3a6dc6698"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfe1090245c078720d250d19cb05d67e21a9cd7c257698ef139bc41cf6c27b4f"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e357571bb0efd65fd55f18db0a2fb0ed89d0bb1d41d906b138f088933ae618bb"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b3dcd587b69bbf54fc04ca157c2323b8911033e827fffaecf0cafa5a892a0904"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c120c9ce3b163b985a3b966bb701114beb1da4b0468b9b236fc754783d85aa3"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15d6bca84ffc966cc9976b09a18cf9543ed4d4ecbd97e7086f9ce9327ea48891"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5cabb9710f09d5d2e9e2748c3e3e20d991a4c5f96ed8f1132518f54ab2967221"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:82f55187a5bebae7d81d35b1e9aaea5e169d44819789837cdd4720d768c55d15"}, + {file = "pydantic_core-2.10.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1d40f55222b233e98e3921df7811c27567f0e1a4411b93d4c5c0f4ce131bc42f"}, + {file = "pydantic_core-2.10.1-cp37-none-win32.whl", hash = "sha256:14e09ff0b8fe6e46b93d36a878f6e4a3a98ba5303c76bb8e716f4878a3bee92c"}, + {file = "pydantic_core-2.10.1-cp37-none-win_amd64.whl", hash = "sha256:1396e81b83516b9d5c9e26a924fa69164156c148c717131f54f586485ac3c15e"}, + {file = "pydantic_core-2.10.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:6835451b57c1b467b95ffb03a38bb75b52fb4dc2762bb1d9dbed8de31ea7d0fc"}, + {file = "pydantic_core-2.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b00bc4619f60c853556b35f83731bd817f989cba3e97dc792bb8c97941b8053a"}, + {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fa467fd300a6f046bdb248d40cd015b21b7576c168a6bb20aa22e595c8ffcdd"}, + {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d99277877daf2efe074eae6338453a4ed54a2d93fb4678ddfe1209a0c93a2468"}, + {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa7db7558607afeccb33c0e4bf1c9a9a835e26599e76af6fe2fcea45904083a6"}, + {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aad7bd686363d1ce4ee930ad39f14e1673248373f4a9d74d2b9554f06199fb58"}, + {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:443fed67d33aa85357464f297e3d26e570267d1af6fef1c21ca50921d2976302"}, + {file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:042462d8d6ba707fd3ce9649e7bf268633a41018d6a998fb5fbacb7e928a183e"}, + {file = "pydantic_core-2.10.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ecdbde46235f3d560b18be0cb706c8e8ad1b965e5c13bbba7450c86064e96561"}, + {file = "pydantic_core-2.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ed550ed05540c03f0e69e6d74ad58d026de61b9eaebebbaaf8873e585cbb18de"}, + {file = "pydantic_core-2.10.1-cp38-none-win32.whl", hash = "sha256:8cdbbd92154db2fec4ec973d45c565e767ddc20aa6dbaf50142676484cbff8ee"}, + {file = "pydantic_core-2.10.1-cp38-none-win_amd64.whl", hash = "sha256:9f6f3e2598604956480f6c8aa24a3384dbf6509fe995d97f6ca6103bb8c2534e"}, + {file = "pydantic_core-2.10.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:655f8f4c8d6a5963c9a0687793da37b9b681d9ad06f29438a3b2326d4e6b7970"}, + {file = "pydantic_core-2.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e570ffeb2170e116a5b17e83f19911020ac79d19c96f320cbfa1fa96b470185b"}, + {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64322bfa13e44c6c30c518729ef08fda6026b96d5c0be724b3c4ae4da939f875"}, + {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:485a91abe3a07c3a8d1e082ba29254eea3e2bb13cbbd4351ea4e5a21912cc9b0"}, + {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7c2b8eb9fc872e68b46eeaf835e86bccc3a58ba57d0eedc109cbb14177be531"}, + {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5cb87bdc2e5f620693148b5f8f842d293cae46c5f15a1b1bf7ceeed324a740c"}, + {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25bd966103890ccfa028841a8f30cebcf5875eeac8c4bde4fe221364c92f0c9a"}, + {file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f323306d0556351735b54acbf82904fe30a27b6a7147153cbe6e19aaaa2aa429"}, + {file = "pydantic_core-2.10.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0c27f38dc4fbf07b358b2bc90edf35e82d1703e22ff2efa4af4ad5de1b3833e7"}, + {file = "pydantic_core-2.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f1365e032a477c1430cfe0cf2856679529a2331426f8081172c4a74186f1d595"}, + {file = "pydantic_core-2.10.1-cp39-none-win32.whl", hash = "sha256:a1c311fd06ab3b10805abb72109f01a134019739bd3286b8ae1bc2fc4e50c07a"}, + {file = "pydantic_core-2.10.1-cp39-none-win_amd64.whl", hash = "sha256:ae8a8843b11dc0b03b57b52793e391f0122e740de3df1474814c700d2622950a"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d43002441932f9a9ea5d6f9efaa2e21458221a3a4b417a14027a1d530201ef1b"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fcb83175cc4936a5425dde3356f079ae03c0802bbdf8ff82c035f8a54b333521"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:962ed72424bf1f72334e2f1e61b68f16c0e596f024ca7ac5daf229f7c26e4208"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cf5bb4dd67f20f3bbc1209ef572a259027c49e5ff694fa56bed62959b41e1f9"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e544246b859f17373bed915182ab841b80849ed9cf23f1f07b73b7c58baee5fb"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c0877239307b7e69d025b73774e88e86ce82f6ba6adf98f41069d5b0b78bd1bf"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:53df009d1e1ba40f696f8995683e067e3967101d4bb4ea6f667931b7d4a01357"}, + {file = "pydantic_core-2.10.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a1254357f7e4c82e77c348dabf2d55f1d14d19d91ff025004775e70a6ef40ada"}, + {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:524ff0ca3baea164d6d93a32c58ac79eca9f6cf713586fdc0adb66a8cdeab96a"}, + {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f0ac9fb8608dbc6eaf17956bf623c9119b4db7dbb511650910a82e261e6600f"}, + {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:320f14bd4542a04ab23747ff2c8a778bde727158b606e2661349557f0770711e"}, + {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:63974d168b6233b4ed6a0046296803cb13c56637a7b8106564ab575926572a55"}, + {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:417243bf599ba1f1fef2bb8c543ceb918676954734e2dcb82bf162ae9d7bd514"}, + {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:dda81e5ec82485155a19d9624cfcca9be88a405e2857354e5b089c2a982144b2"}, + {file = "pydantic_core-2.10.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:14cfbb00959259e15d684505263d5a21732b31248a5dd4941f73a3be233865b9"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:631cb7415225954fdcc2a024119101946793e5923f6c4d73a5914d27eb3d3a05"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:bec7dd208a4182e99c5b6c501ce0b1f49de2802448d4056091f8e630b28e9a52"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:149b8a07712f45b332faee1a2258d8ef1fb4a36f88c0c17cb687f205c5dc6e7d"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d966c47f9dd73c2d32a809d2be529112d509321c5310ebf54076812e6ecd884"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7eb037106f5c6b3b0b864ad226b0b7ab58157124161d48e4b30c4a43fef8bc4b"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:154ea7c52e32dce13065dbb20a4a6f0cc012b4f667ac90d648d36b12007fa9f7"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e562617a45b5a9da5be4abe72b971d4f00bf8555eb29bb91ec2ef2be348cd132"}, + {file = "pydantic_core-2.10.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f23b55eb5464468f9e0e9a9935ce3ed2a870608d5f534025cd5536bca25b1402"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:e9121b4009339b0f751955baf4543a0bfd6bc3f8188f8056b1a25a2d45099934"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:0523aeb76e03f753b58be33b26540880bac5aa54422e4462404c432230543f33"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e0e2959ef5d5b8dc9ef21e1a305a21a36e254e6a34432d00c72a92fdc5ecda5"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da01bec0a26befab4898ed83b362993c844b9a607a86add78604186297eb047e"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f2e9072d71c1f6cfc79a36d4484c82823c560e6f5599c43c1ca6b5cdbd54f881"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f36a3489d9e28fe4b67be9992a23029c3cec0babc3bd9afb39f49844a8c721c5"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f64f82cc3443149292b32387086d02a6c7fb39b8781563e0ca7b8d7d9cf72bd7"}, + {file = "pydantic_core-2.10.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b4a6db486ac8e99ae696e09efc8b2b9fea67b63c8f88ba7a1a16c24a057a0776"}, + {file = "pydantic_core-2.10.1.tar.gz", hash = "sha256:0f8682dbdd2f67f8e1edddcbffcc29f60a6182b4901c367fc8c1c40d30bb0a82"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydispatcher" @@ -4707,4 +4803,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "~3.9" -content-hash = "2a5acb8ef1defd96d1307287a26ef30d0daf4152f4bb59dbf829c0c9280ebb65" +content-hash = "ef2620eec803ce51b4fd1072715a0ab565384caab6725f69e868e2bea6ffb1a2" diff --git a/pyproject.toml b/pyproject.toml index b9263fd..fb561bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ awscli = "^1.26.16" news-please = "^1.5.22" pandas = "^1.4.4" tqdm = "^4.64.1" -pydantic = "^1.10.2" +pydantic = "^2.4.0" click = "^8.1.3" langdetect = "^1.0.9" playwright = "^1.35.0" @@ -29,7 +29,7 @@ azure-ai-formrecognizer = "^3.2.1" pytest = "^7.4.0" mock = "^5.1.0" pypdf2 = "^3.0.1" -azure-pdf-parser = {git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.2.1"} +azure-pdf-parser = {git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", rev = "7fe0aef5ba969b41783eaa34bcd51ef16d840204"} [tool.poetry.dev-dependencies] pre-commit = "^2.20.0" diff --git a/src/html_parser/newsplease.py b/src/html_parser/newsplease.py index 52ecf40..e9e43c5 100644 --- a/src/html_parser/newsplease.py +++ b/src/html_parser/newsplease.py @@ -49,7 +49,7 @@ def parse_html(self, html: str, input: ParserInput) -> ParserOutput: ) except Exception as e: _LOGGER.exception( - "Failed to parse document.{input.document_source_url} for {input.document_id}", + f"Failed to parse document.{input.document_source_url} for {input.document_id}", extra={ "document_id": input.document_id, "source_url": input.document_source_url, @@ -120,7 +120,7 @@ def _newsplease_article_to_parsed_html( has_valid_text = len(text_by_line) >= HTML_MIN_NO_LINES_FOR_VALID_TEXT text_blocks = [ - HTMLTextBlock.parse_obj( + HTMLTextBlock.model_validate( { "text_block_id": f"b{idx}", "text": [text], diff --git a/src/html_parser/readability.py b/src/html_parser/readability.py index a2e8dd7..eeaff8f 100644 --- a/src/html_parser/readability.py +++ b/src/html_parser/readability.py @@ -90,7 +90,7 @@ def parse_html(self, html: str, input: ParserInput) -> ParserOutput: has_valid_text = len(text_by_line) >= HTML_MIN_NO_LINES_FOR_VALID_TEXT text_blocks = [ - HTMLTextBlock.parse_obj( + HTMLTextBlock.model_validate( { "text_block_id": f"b{idx}", "text": [text], diff --git a/src/html_parser/test/test_parsers.py b/src/html_parser/test/test_parsers.py index 2b72573..ffe6f2c 100644 --- a/src/html_parser/test/test_parsers.py +++ b/src/html_parser/test/test_parsers.py @@ -26,7 +26,7 @@ def test_parse(url: str, parser: HTMLParser) -> None: :param parser: HTML parser """ - input = ParserInput.parse_obj( + input = ParserInput.model_validate( { "document_id": "test_id", "document_metadata": { diff --git a/src/translator/test/test_translate.py b/src/translator/test/test_translate.py index 0703cbe..220a765 100644 --- a/src/translator/test/test_translate.py +++ b/src/translator/test/test_translate.py @@ -18,15 +18,18 @@ def test_translate_parser_output() -> None: with mock.patch( "src.translator.translate.translate_text", wraps=fake_translate_text, - ): - parser_output = ParserOutput.parse_file( - Path(__file__).parent.parent.parent.parent + ): + test_file_path = (Path(__file__).parent.parent.parent.parent / "cli" / "test" / "test_data" / "output" / "test_html.json" ) + + parser_output = ParserOutput.model_validate_json( + test_file_path.read_text() + ) translated_parser_output = translate_parser_output(parser_output, "fr") diff --git a/src/translator/translate.py b/src/translator/translate.py index 8720165..7d396b5 100644 --- a/src/translator/translate.py +++ b/src/translator/translate.py @@ -62,7 +62,7 @@ def translate_parser_output( """ # A deep copy here prevents text blocks in the original ParserOutput object from being modified in place - new_parser_output = parser_output.copy(deep=True) + new_parser_output = parser_output.model_copy(deep=True) # Translate document name, document description and text new_parser_output.document_name = translate_text( From 22be198c1b08d55582d4528e8f68dad3744c4be8 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 8 Nov 2023 15:39:00 +0000 Subject: [PATCH 2/6] Minor bug fix. --- cli/translate_outputs.py | 2 +- src/html_parser/newsplease.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cli/translate_outputs.py b/cli/translate_outputs.py index 7d9f645..892e4ed 100644 --- a/cli/translate_outputs.py +++ b/cli/translate_outputs.py @@ -164,7 +164,7 @@ def _translate_to_target_languages( try: output_path.write_text( # type: ignore - translated_parser_output.json(indent=4, ensure_ascii=False) + translated_parser_output.model_dump_json(indent=4) ) _LOGGER.info( "Saved translated output for document.", diff --git a/src/html_parser/newsplease.py b/src/html_parser/newsplease.py index e9e43c5..226ed83 100644 --- a/src/html_parser/newsplease.py +++ b/src/html_parser/newsplease.py @@ -49,10 +49,11 @@ def parse_html(self, html: str, input: ParserInput) -> ParserOutput: ) except Exception as e: _LOGGER.exception( - f"Failed to parse document.{input.document_source_url} for {input.document_id}", + f"Failed to parse {input.document_source_url} for {input.document_id}", extra={ "document_id": input.document_id, "source_url": input.document_source_url, + "html": html, "error_message": e, }, ) From 0a87d3f63f3341ac16564365545ed45f334ff0b6 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 8 Nov 2023 16:11:59 +0000 Subject: [PATCH 3/6] Refactoring. --- cli/parse_htmls.py | 4 +++- cli/test/test_run_parser.py | 10 +++++++--- cli/translate_outputs.py | 2 +- src/html_parser/newsplease.py | 2 +- src/translator/test/test_translate.py | 11 +++++------ 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cli/parse_htmls.py b/cli/parse_htmls.py index 029ff82..7604854 100644 --- a/cli/parse_htmls.py +++ b/cli/parse_htmls.py @@ -84,7 +84,9 @@ def run_html_parser( if not output_path.exists(): copy_input_to_output_html(task, output_path) - existing_parser_output = ParserOutput.model_validate_json(output_path.read_text()) + existing_parser_output = ParserOutput.model_validate_json( + output_path.read_text() + ) # If no parsed html dta exists, assume we've not run before existing_html_data_exists = ( existing_parser_output.html_data is not None diff --git a/cli/test/test_run_parser.py b/cli/test/test_run_parser.py index f86c27c..2657cab 100644 --- a/cli/test/test_run_parser.py +++ b/cli/test/test_run_parser.py @@ -390,7 +390,7 @@ def get_parser_output( document_metadata=BackendDocument.model_validate(document_metadata), document_name="sdf", document_description="sdf", - document_source_url=source_url, # type: ignore + document_source_url=source_url, # type: ignore document_cdn_object="sdf", document_content_type="text/html", document_md5_sum="sdf", @@ -623,7 +623,9 @@ def test_fail_safely_on_azure_http_response_error( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.model_validate_json(output_file.read_text()) + parser_output = ParserOutput.model_validate_json( + output_file.read_text() + ) assert isinstance(parser_output, ParserOutput) # Any html data should be parsed successfully as it is not using the @@ -735,7 +737,9 @@ def test_fail_safely_on_azure_http_response_error_large_doc( } for output_file in Path(output_dir).glob("*.json"): - parser_output = ParserOutput.model_validate_json(output_file.read_text()) + parser_output = ParserOutput.model_validate_json( + output_file.read_text() + ) assert isinstance(parser_output, ParserOutput) # Any html data should be parsed successfully as it is not using the diff --git a/cli/translate_outputs.py b/cli/translate_outputs.py index 892e4ed..a14a6f6 100644 --- a/cli/translate_outputs.py +++ b/cli/translate_outputs.py @@ -65,7 +65,7 @@ def translate_parser_outputs( ) try: - parser_output = ParserOutput.parse_raw(path.read_text()) + parser_output = ParserOutput.model_validate_json(path.read_text()) _LOGGER.debug( "Successfully parsed document from output dir during translation processing.", extra={"props": {"path": f"{path}"}}, diff --git a/src/html_parser/newsplease.py b/src/html_parser/newsplease.py index 226ed83..4c400aa 100644 --- a/src/html_parser/newsplease.py +++ b/src/html_parser/newsplease.py @@ -53,7 +53,7 @@ def parse_html(self, html: str, input: ParserInput) -> ParserOutput: extra={ "document_id": input.document_id, "source_url": input.document_source_url, - "html": html, + "html": html, "error_message": e, }, ) diff --git a/src/translator/test/test_translate.py b/src/translator/test/test_translate.py index 220a765..80d2a78 100644 --- a/src/translator/test/test_translate.py +++ b/src/translator/test/test_translate.py @@ -18,18 +18,17 @@ def test_translate_parser_output() -> None: with mock.patch( "src.translator.translate.translate_text", wraps=fake_translate_text, - ): - test_file_path = (Path(__file__).parent.parent.parent.parent + ): + test_file_path = ( + Path(__file__).parent.parent.parent.parent / "cli" / "test" / "test_data" / "output" / "test_html.json" ) - - parser_output = ParserOutput.model_validate_json( - test_file_path.read_text() - ) + + parser_output = ParserOutput.model_validate_json(test_file_path.read_text()) translated_parser_output = translate_parser_output(parser_output, "fr") From a3b1c0bb6bd8f2dbbaaef2a21f0952d875313abb Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 8 Nov 2023 16:35:51 +0000 Subject: [PATCH 4/6] Debugging test. --- src/html_parser/newsplease.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/html_parser/newsplease.py b/src/html_parser/newsplease.py index 4c400aa..c41c377 100644 --- a/src/html_parser/newsplease.py +++ b/src/html_parser/newsplease.py @@ -45,7 +45,7 @@ def parse_html(self, html: str, input: ParserInput) -> ParserOutput: ) article = NewsPlease.from_html( - html=html, url=input.document_source_url, fetch_images=False + html=html, url=str(input.document_source_url), fetch_images=False ) except Exception as e: _LOGGER.exception( @@ -53,7 +53,6 @@ def parse_html(self, html: str, input: ParserInput) -> ParserOutput: extra={ "document_id": input.document_id, "source_url": input.document_source_url, - "html": html, "error_message": e, }, ) From 7b90b271099c1cc04f03a93cd8fee7b27e3ac97e Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 9 Nov 2023 13:22:12 +0000 Subject: [PATCH 5/6] Adding semver tag for azure pdf parser dependency. --- poetry.lock | 12 ++++++------ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index e2de302..3fbe78a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -281,7 +281,7 @@ develop = false [package.dependencies] azure-ai-formrecognizer = "^3.2.1" -cpr-data-access = {git = "https://github.com/climatepolicyradar/data-access.git", rev = "7d7a3c05439532a1fdf473e12b7071ee918cf31c"} +cpr-data-access = {git = "https://github.com/climatepolicyradar/data-access.git", tag = "0.3.0"} langdetect = "^1.0.9" pypdf = "^3.15.0" requests = "^2.31.0" @@ -289,8 +289,8 @@ requests = "^2.31.0" [package.source] type = "git" url = "https://github.com/climatepolicyradar/azure-pdf-parser.git" -reference = "7fe0aef5ba969b41783eaa34bcd51ef16d840204" -resolved_reference = "7fe0aef5ba969b41783eaa34bcd51ef16d840204" +reference = "v0.3.0" +resolved_reference = "cb19f3b5ffedda10ee7574c9d9bb907d08be343e" [[package]] name = "beautifulsoup4" @@ -733,8 +733,8 @@ tqdm = "^4.64.1" [package.source] type = "git" url = "https://github.com/climatepolicyradar/data-access.git" -reference = "7d7a3c05439532a1fdf473e12b7071ee918cf31c" -resolved_reference = "7d7a3c05439532a1fdf473e12b7071ee918cf31c" +reference = "0.3.0" +resolved_reference = "207881edae5ff647d0573ca8e07b3c8678350131" [[package]] name = "cryptography" @@ -4803,4 +4803,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "~3.9" -content-hash = "ef2620eec803ce51b4fd1072715a0ab565384caab6725f69e868e2bea6ffb1a2" +content-hash = "57d58b06c5f4fae5956da1e3d175daee0a767567c4117d3b47f7ccfdd95b9788" diff --git a/pyproject.toml b/pyproject.toml index fb561bd..d197883 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ azure-ai-formrecognizer = "^3.2.1" pytest = "^7.4.0" mock = "^5.1.0" pypdf2 = "^3.0.1" -azure-pdf-parser = {git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", rev = "7fe0aef5ba969b41783eaa34bcd51ef16d840204"} +azure-pdf-parser = {git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.3.0"} [tool.poetry.dev-dependencies] pre-commit = "^2.20.0" From b8c77d228cb44d04e8575c345b48e03fa6e98e2b Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 9 Nov 2023 14:15:51 +0000 Subject: [PATCH 6/6] Updating type. --- cli/test/test_run_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cli/test/test_run_parser.py b/cli/test/test_run_parser.py index 2657cab..77af057 100644 --- a/cli/test/test_run_parser.py +++ b/cli/test/test_run_parser.py @@ -19,6 +19,7 @@ from azure_pdf_parser.base import PDFPagesBatchExtracted from azure.ai.formrecognizer import AnalyzeResult from mock import patch +from pydantic import AnyHttpUrl from cli.run_parser import main as cli_main from cli.translate_outputs import should_be_translated, identify_translation_languages @@ -390,7 +391,7 @@ def get_parser_output( document_metadata=BackendDocument.model_validate(document_metadata), document_name="sdf", document_description="sdf", - document_source_url=source_url, # type: ignore + document_source_url=AnyHttpUrl(source_url) if source_url else None, document_cdn_object="sdf", document_content_type="text/html", document_md5_sum="sdf",