Skip to content
This repository was archived by the owner on Sep 11, 2024. It is now read-only.

Commit 8964970

Browse files
authored
Merge pull request #47 from climatepolicyradar/feature/pdct-471-bug-fix-parser-output-method-that-returns-bad-text
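Adding exception to the method: vertically_flip_text_block_coords now raises a VerticalFlipError when flipping the text block coordinates fails.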
2 parents 56217c3 + bb01714 commit 8964970

File tree

2 files changed

+36
-19
lines changed

2 files changed

+36
-19
lines changed

src/cpr_data_access/parser_models.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@
1919
logger = logging.getLogger(__name__)
2020

2121

22+
class VerticalFlipError(Exception):
23+
"""Exception for when a vertical flip fails."""
24+
25+
pass
26+
27+
2228
class BlockType(str, Enum):
2329
"""
2430
List of possible block types from the PubLayNet model.
@@ -319,6 +325,10 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO:
319325
Flips the coordinates of all PDF text blocks vertically.
320326
321327
Acts in-place on the coordinates in the ParserOutput object.
328+
329+
Should the document fail to flip, a VerticalFlipError is raised. This is most
330+
commonly due to a page number being referenced in a text block that doesn't
331+
exist in the page_metadata mapping.
322332
"""
323333

324334
if self.pdf_data is None:
@@ -344,11 +354,14 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO:
344354
text_block.coords[1],
345355
text_block.coords[0],
346356
]
347-
except Exception:
357+
except Exception as e:
348358
logger.exception(
349359
"Error flipping text block coordinates.",
350360
extra={"props": {"document_id": self.document_id}},
351361
)
362+
raise VerticalFlipError(
363+
f"Failed to flip text blocks for {self.document_id}"
364+
) from e
352365

353366
return self
354367

tests/test_parser_models.py

+22-18
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
import unittest
2-
31
import pydantic
2+
import pytest
43

54
from cpr_data_access.parser_models import (
65
ParserInput,
76
ParserOutput,
7+
VerticalFlipError,
8+
PDFTextBlock,
89
)
910
from cpr_data_access.pipeline_general_models import (
1011
CONTENT_TYPE_PDF,
@@ -47,45 +48,37 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
4748
parser_output_no_pdf_data["pdf_data"] = None
4849
parser_output_no_pdf_data["document_content_type"] = CONTENT_TYPE_PDF
4950

50-
with unittest.TestCase().assertRaises(
51-
pydantic.error_wrappers.ValidationError
52-
) as context:
51+
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
5352
ParserOutput.parse_obj(parser_output_no_pdf_data)
54-
assert "pdf_data must be set for PDF documents" in str(context.exception)
53+
assert "pdf_data must be set for PDF documents" in str(context.value)
5554

5655
parser_output_no_html_data = parser_output_json_pdf.copy()
5756
parser_output_no_html_data["html_data"] = None
5857
parser_output_no_html_data["document_content_type"] = CONTENT_TYPE_HTML
5958

60-
with unittest.TestCase().assertRaises(
61-
pydantic.error_wrappers.ValidationError
62-
) as context:
59+
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
6360
ParserOutput.parse_obj(parser_output_no_html_data)
64-
assert "html_data must be set for HTML documents" in str(context.exception)
61+
assert "html_data must be set for HTML documents" in str(context.value)
6562

6663
parser_output_no_content_type = parser_output_json_pdf.copy()
6764
# PDF data is set as the default
6865
parser_output_no_content_type["document_content_type"] = None
6966

70-
with unittest.TestCase().assertRaises(
71-
pydantic.error_wrappers.ValidationError
72-
) as context:
67+
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
7368
ParserOutput.parse_obj(parser_output_no_content_type)
7469
assert (
7570
"html_data and pdf_data must be null for documents with no content type."
76-
) in str(context.exception)
71+
) in str(context.value)
7772

7873
parser_output_not_known_content_type = parser_output_json_pdf.copy()
7974
# PDF data is set as the default
8075
parser_output_not_known_content_type["document_content_type"] = "not_known"
8176

82-
with unittest.TestCase().assertRaises(
83-
pydantic.error_wrappers.ValidationError
84-
) as context:
77+
with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
8578
ParserOutput.parse_obj(parser_output_not_known_content_type)
8679
assert (
8780
"html_data and pdf_data must be null for documents with no content type."
88-
) in str(context.exception)
81+
) in str(context.value)
8982

9083
# Test the text blocks property
9184
assert ParserOutput.parse_obj(parser_output_json_pdf).text_blocks != []
@@ -103,6 +96,17 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
10396
original_text_blocks = parser_output.text_blocks
10497
assert parser_output.vertically_flip_text_block_coords() != original_text_blocks
10598

99+
parser_output = ParserOutput.parse_obj(parser_output_json_pdf)
100+
# Set a page number that doesn't exist in the page_metadata field to trigger the exception
101+
assert isinstance(parser_output.text_blocks[0], PDFTextBlock)
102+
parser_output.text_blocks[0].page_number = 123456 # type: ignore
103+
104+
with pytest.raises(VerticalFlipError) as context:
105+
parser_output.vertically_flip_text_block_coords()
106+
assert str(context.value) == (
107+
f"Failed to flip text blocks for {parser_output.document_id}"
108+
)
109+
106110
# Test the get_text_blocks method
107111
# The test html document has invalid html data so the text blocks should be empty
108112
parser_output = ParserOutput.parse_obj(parser_output_json_html)

0 commit comments

Comments
 (0)