Skip to content

Commit

Permalink
Merge pull request #118 from climatepolicyradar/feature/integrate-latest-azure-pdf-parser
Browse files Browse the repository at this point in the history

Integrate latest azure pdf parser and pydantic.
  • Loading branch information
THOR300 authored Nov 9, 2023
2 parents cc7f44a + b8c77d2 commit 8c70f80
Show file tree
Hide file tree
Showing 13 changed files with 189 additions and 84 deletions.
8 changes: 5 additions & 3 deletions cli/parse_htmls.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def copy_input_to_output_html(
pdf_data=None,
)

output_path.write_text(blank_output.json(indent=4, ensure_ascii=False))
output_path.write_text(blank_output.model_dump_json(indent=4))

_LOGGER.info(
"Blank html output saved.",
Expand Down Expand Up @@ -84,7 +84,9 @@ def run_html_parser(
if not output_path.exists():
copy_input_to_output_html(task, output_path)

existing_parser_output = ParserOutput.parse_raw(output_path.read_text())
existing_parser_output = ParserOutput.model_validate_json(
output_path.read_text()
)
# If no parsed html data exists, assume we've not run before
existing_html_data_exists = (
existing_parser_output.html_data is not None
Expand All @@ -107,7 +109,7 @@ def run_html_parser(
parsed_html = html_parser.parse(task).detect_and_set_languages()

try:
output_path.write_text(parsed_html.json(indent=4, ensure_ascii=False))
output_path.write_text(parsed_html.model_dump_json(indent=4))
except cloudpathlib.exceptions.OverwriteNewerCloudError as e:
_LOGGER.error(
"Attempted write to s3, received OverwriteNewerCloudError and therefore skipping.",
Expand Down
2 changes: 1 addition & 1 deletion cli/parse_no_content_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def process_documents_with_no_content_type(

output_path = output_dir / f"{task.document_id}.json"
try:
output_path.write_text(output.json(indent=4, ensure_ascii=False))
output_path.write_text(output.model_dump_json(indent=4))
except cloudpathlib.exceptions.OverwriteNewerCloudError as e:
_LOGGER.error(
"Attempted write to s3, received OverwriteNewerCloudError and therefore skipping.",
Expand Down
10 changes: 5 additions & 5 deletions cli/parse_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def copy_input_to_output_pdf(
)

try:
output_path.write_text(blank_output.json(indent=4, ensure_ascii=False))
output_path.write_text(blank_output.model_dump_json(indent=4))
_LOGGER.info(
"Blank output saved.",
extra={
Expand Down Expand Up @@ -246,7 +246,7 @@ def add_parser_metadata(
def save_api_response(
azure_cache_dir: Union[Path, S3Path, None],
input_task: ParserInput,
api_response_array: [AnalyzeResult],
api_response_array: List[AnalyzeResult],
) -> None:
"""Cache the raw api responses as an array of json data."""
if azure_cache_dir:
Expand Down Expand Up @@ -318,7 +318,7 @@ def parse_file(
if not output_path.exists():
copy_input_to_output_pdf(input_task, output_path)

existing_parser_output = ParserOutput.parse_raw(output_path.read_text())
existing_parser_output = ParserOutput.model_validate_json(output_path.read_text())
# If no parsed pdf data exists, assume we've not run before
existing_pdf_data_exists = (
existing_parser_output.pdf_data is not None
Expand Down Expand Up @@ -426,7 +426,7 @@ def parse_file(
)
return None

document: ParserOutput = azure_api_response_to_parser_output(
document = azure_api_response_to_parser_output(
parser_input=input_task,
md5_sum=calculate_pdf_md5sum(str(pdf_path)),
api_response=api_response,
Expand All @@ -452,7 +452,7 @@ def parse_file(
)

try:
output_path.write_text(document.json(indent=4, ensure_ascii=False))
output_path.write_text(document.model_dump_json(indent=4))
except cloudpathlib.exceptions.OverwriteNewerCloudError as e:
_LOGGER.error(
"Attempted write to s3, received OverwriteNewerCloudError and "
Expand Down
4 changes: 2 additions & 2 deletions cli/run_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,8 @@ def main(
tasks = []
for path in files_to_parse:
try:
tasks.append(ParserInput.parse_raw(path.read_text()))
except (pydantic.error_wrappers.ValidationError, KeyError) as e:
tasks.append(ParserInput.model_validate_json(path.read_text()))
except (pydantic.ValidationError, KeyError) as e:
_LOGGER.error(
"Failed to parse input file.",
extra={
Expand Down
31 changes: 18 additions & 13 deletions cli/test/test_run_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from azure_pdf_parser.base import PDFPagesBatchExtracted
from azure.ai.formrecognizer import AnalyzeResult
from mock import patch
from pydantic import AnyHttpUrl

from cli.run_parser import main as cli_main
from cli.translate_outputs import should_be_translated, identify_translation_languages
Expand Down Expand Up @@ -82,7 +83,7 @@ def test_run_parser_local_parallel(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

if parser_output.document_content_type == CONTENT_TYPE_HTML:
Expand Down Expand Up @@ -124,7 +125,7 @@ def test_run_parser_local_series(test_input_dir) -> None:
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

if parser_output.document_content_type == CONTENT_TYPE_HTML:
Expand Down Expand Up @@ -166,7 +167,7 @@ def test_run_parser_cache_azure_response_local(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

if parser_output.document_content_type == CONTENT_TYPE_HTML:
Expand Down Expand Up @@ -296,7 +297,7 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None:
with tempfile.TemporaryDirectory() as output_dir:
with open(Path(output_dir) / "test_pdf.json", "w") as f:
f.write(
ParserOutput.parse_obj(
ParserOutput.model_validate(
{
"document_id": "test_pdf",
"document_metadata": backend_document_json,
Expand Down Expand Up @@ -324,12 +325,12 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None:
},
"html_data": None,
}
).json()
).model_dump_json()
)

with open(Path(output_dir) / "test_html.json", "w") as f:
f.write(
ParserOutput.parse_obj(
ParserOutput.model_validate(
{
"document_id": "test_html",
"document_metadata": backend_document_json,
Expand All @@ -356,7 +357,7 @@ def test_run_parser_skip_already_done(backend_document_json, caplog) -> None:
},
"pdf_data": None,
}
).json()
).model_dump_json()
)

runner = CliRunner()
Expand Down Expand Up @@ -387,10 +388,10 @@ def get_parser_output(
"""Generate the parser output objects for the tests given input variables."""
return ParserOutput(
document_id="sdf",
document_metadata=BackendDocument.parse_obj(document_metadata),
document_metadata=BackendDocument.model_validate(document_metadata),
document_name="sdf",
document_description="sdf",
document_source_url=source_url,
document_source_url=AnyHttpUrl(source_url) if source_url else None,
document_cdn_object="sdf",
document_content_type="text/html",
document_md5_sum="sdf",
Expand Down Expand Up @@ -496,7 +497,7 @@ def test_fail_safely_on_azure_uncaught_exception(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the azure
Expand Down Expand Up @@ -548,7 +549,7 @@ def test_fail_safely_on_azure_service_request_error(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(output_file.read_text())
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the azure
Expand Down Expand Up @@ -623,7 +624,9 @@ def test_fail_safely_on_azure_http_response_error(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(
output_file.read_text()
)
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the
Expand Down Expand Up @@ -735,7 +738,9 @@ def test_fail_safely_on_azure_http_response_error_large_doc(
}

for output_file in Path(output_dir).glob("*.json"):
parser_output = ParserOutput.parse_file(output_file)
parser_output = ParserOutput.model_validate_json(
output_file.read_text()
)
assert isinstance(parser_output, ParserOutput)

# Any html data should be parsed successfully as it is not using the
Expand Down
4 changes: 2 additions & 2 deletions cli/translate_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def translate_parser_outputs(
)

try:
parser_output = ParserOutput.parse_raw(path.read_text())
parser_output = ParserOutput.model_validate_json(path.read_text())
_LOGGER.debug(
"Successfully parsed document from output dir during translation processing.",
extra={"props": {"path": f"{path}"}},
Expand Down Expand Up @@ -164,7 +164,7 @@ def _translate_to_target_languages(

try:
output_path.write_text( # type: ignore
translated_parser_output.json(indent=4, ensure_ascii=False)
translated_parser_output.model_dump_json(indent=4)
)
_LOGGER.info(
"Saved translated output for document.",
Expand Down
Loading

0 comments on commit 8c70f80

Please sign in to comment.