Skip to content

Commit

Permalink
Starting to integrate latest azure pdf parser and pydantic.
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark committed Nov 8, 2023
1 parent 95ecfd4 commit 987d734
Show file tree
Hide file tree
Showing 12 changed files with 182 additions and 83 deletions.
6 changes: 3 additions & 3 deletions cli/parse_htmls.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def copy_input_to_output_html(
pdf_data=None,
)

output_path.write_text(blank_output.json(indent=4, ensure_ascii=False))
output_path.write_text(blank_output.model_dump_json(indent=4))

_LOGGER.info(
"Blank html output saved.",
Expand Down Expand Up @@ -84,7 +84,7 @@ def run_html_parser(
if not output_path.exists():
copy_input_to_output_html(task, output_path)

existing_parser_output = ParserOutput.parse_raw(output_path.read_text())
existing_parser_output = ParserOutput.model_validate_json(output_path.read_text())
    # If no parsed html data exists, assume we've not run before
existing_html_data_exists = (
existing_parser_output.html_data is not None
Expand All @@ -107,7 +107,7 @@ def run_html_parser(
parsed_html = html_parser.parse(task).detect_and_set_languages()

try:
output_path.write_text(parsed_html.json(indent=4, ensure_ascii=False))
output_path.write_text(parsed_html.model_dump_json(indent=4))
except cloudpathlib.exceptions.OverwriteNewerCloudError as e:
_LOGGER.error(
"Attempted write to s3, received OverwriteNewerCloudError and therefore skipping.",
Expand Down
2 changes: 1 addition & 1 deletion cli/parse_no_content_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def process_documents_with_no_content_type(

output_path = output_dir / f"{task.document_id}.json"
try:
output_path.write_text(output.json(indent=4, ensure_ascii=False))
output_path.write_text(output.model_dump_json(indent=4))
except cloudpathlib.exceptions.OverwriteNewerCloudError as e:
_LOGGER.error(
"Attempted write to s3, received OverwriteNewerCloudError and therefore skipping.",
Expand Down
10 changes: 5 additions & 5 deletions cli/parse_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def copy_input_to_output_pdf(
)

try:
output_path.write_text(blank_output.json(indent=4, ensure_ascii=False))
output_path.write_text(blank_output.model_dump_json(indent=4))
_LOGGER.info(
"Blank output saved.",
extra={
Expand Down Expand Up @@ -246,7 +246,7 @@ def add_parser_metadata(
def save_api_response(
azure_cache_dir: Union[Path, S3Path, None],
input_task: ParserInput,
api_response_array: [AnalyzeResult],
api_response_array: List[AnalyzeResult],
) -> None:
"""Cache the raw api responses as an array of json data."""
if azure_cache_dir:
Expand Down Expand Up @@ -318,7 +318,7 @@ def parse_file(
if not output_path.exists():
copy_input_to_output_pdf(input_task, output_path)

existing_parser_output = ParserOutput.parse_raw(output_path.read_text())
existing_parser_output = ParserOutput.model_validate_json(output_path.read_text())
# If no parsed pdf data exists, assume we've not run before
existing_pdf_data_exists = (
existing_parser_output.pdf_data is not None
Expand Down Expand Up @@ -426,7 +426,7 @@ def parse_file(
)
return None

document: ParserOutput = azure_api_response_to_parser_output(
document = azure_api_response_to_parser_output(
parser_input=input_task,
md5_sum=calculate_pdf_md5sum(str(pdf_path)),
api_response=api_response,
Expand All @@ -452,7 +452,7 @@ def parse_file(
)

try:
output_path.write_text(document.json(indent=4, ensure_ascii=False))
output_path.write_text(document.model_dump_json(indent=4))
except cloudpathlib.exceptions.OverwriteNewerCloudError as e:
_LOGGER.error(
"Attempted write to s3, received OverwriteNewerCloudError and "
Expand Down
4 changes: 2 additions & 2 deletions cli/run_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,8 @@ def main(
tasks = []
for path in files_to_parse:
try:
tasks.append(ParserInput.parse_raw(path.read_text()))
except (pydantic.error_wrappers.ValidationError, KeyError) as e:
tasks.append(ParserInput.model_validate_json(path.read_text()))
except (pydantic.ValidationError, KeyError) as e:
_LOGGER.error(
"Failed to parse input file.",
extra={
Expand Down
26 changes: 13 additions & 13 deletions cli/test/test_run_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_run_parser_local_parallel(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

if parser_output.document_content_type == CONTENT_TYPE_HTML:
Expand Down Expand Up @@ -124,7 +124,7 @@ def test_run_parser_local_series(test_input_dir) -> None:
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

if parser_output.document_content_type == CONTENT_TYPE_HTML:
Expand Down Expand Up @@ -166,7 +166,7 @@ def test_run_parser_cache_azure_response_local(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

if parser_output.document_content_type == CONTENT_TYPE_HTML:
Expand Down Expand Up @@ -296,7 +296,7 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None:
with tempfile.TemporaryDirectory() as output_dir:
with open(Path(output_dir) / "test_pdf.json", "w") as f:
f.write(
ParserOutput.parse_obj(
ParserOutput.model_validate(
{
"document_id": "test_pdf",
"document_metadata": backend_document_json,
Expand Down Expand Up @@ -324,12 +324,12 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None:
},
"html_data": None,
}
).json()
).model_dump_json()
)

with open(Path(output_dir) / "test_html.json", "w") as f:
f.write(
ParserOutput.parse_obj(
ParserOutput.model_validate(
{
"document_id": "test_html",
"document_metadata": backend_document_json,
Expand All @@ -356,7 +356,7 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None:
},
"pdf_data": None,
}
).json()
).model_dump_json()
)

runner = CliRunner()
Expand Down Expand Up @@ -387,10 +387,10 @@ def get_parser_output(
"""Generate the parser output objects for the tests given input variables."""
return ParserOutput(
document_id="sdf",
document_metadata=BackendDocument.parse_obj(document_metadata),
document_metadata=BackendDocument.model_validate(document_metadata),
document_name="sdf",
document_description="sdf",
document_source_url=source_url,
document_source_url=source_url, # type: ignore
document_cdn_object="sdf",
document_content_type="text/html",
document_md5_sum="sdf",
Expand Down Expand Up @@ -496,7 +496,7 @@ def test_fail_safely_on_azure_uncaught_exception(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the azure
Expand Down Expand Up @@ -548,7 +548,7 @@ def test_fail_safely_on_azure_service_request_error(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the azure
Expand Down Expand Up @@ -623,7 +623,7 @@ def test_fail_safely_on_azure_http_response_error(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the
Expand Down Expand Up @@ -735,7 +735,7 @@ def test_fail_safely_on_azure_http_response_error_large_doc(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the
Expand Down
Loading

0 comments on commit 987d734

Please sign in to comment.