Merge pull request #47 from climatepolicyradar/feature/pdct-471-bug-fix-parser-output-method-that-returns-bad-text

THOR300 · web-flow · commit 896497058b6d · 2023-10-05T15:51:44.000+01:00
Adding exception to the method.
diff --git a/src/cpr_data_access/parser_models.py b/src/cpr_data_access/parser_models.py
@@ -19,6 +19,12 @@
 logger = logging.getLogger(__name__)
 
 
+class VerticalFlipError(Exception):
+    """Exception for when a vertical flip fails."""
+
+    pass
+
+
 class BlockType(str, Enum):
     """
     List of possible block types from the PubLayNet model.
@@ -319,6 +325,10 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO:
         Flips the coordinates of all PDF text blocks vertically.
 
         Acts in-place on the coordinates in the ParserOutput object.
+
+        Should the document fail to flip, a VerticalFlipError is raised. This is most
+        commonly due to a page number being referenced in a text block that doesn't
+        exist in the page_metadata mapping.
         """
 
         if self.pdf_data is None:
@@ -344,11 +354,14 @@ def vertically_flip_text_block_coords(self: _PO) -> _PO:
                         text_block.coords[1],
                         text_block.coords[0],
                     ]
-        except Exception:
+        except Exception as e:
             logger.exception(
                 "Error flipping text block coordinates.",
                 extra={"props": {"document_id": self.document_id}},
             )
+            raise VerticalFlipError(
+                f"Failed to flip text blocks for {self.document_id}"
+            ) from e
 
         return self
 
diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py
@@ -1,10 +1,11 @@
-import unittest
-
 import pydantic
+import pytest
 
 from cpr_data_access.parser_models import (
     ParserInput,
     ParserOutput,
+    VerticalFlipError,
+    PDFTextBlock,
 )
 from cpr_data_access.pipeline_general_models import (
     CONTENT_TYPE_PDF,
@@ -47,45 +48,37 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
     parser_output_no_pdf_data["pdf_data"] = None
     parser_output_no_pdf_data["document_content_type"] = CONTENT_TYPE_PDF
 
-    with unittest.TestCase().assertRaises(
-        pydantic.error_wrappers.ValidationError
-    ) as context:
+    with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
         ParserOutput.parse_obj(parser_output_no_pdf_data)
-    assert "pdf_data must be set for PDF documents" in str(context.exception)
+    assert "pdf_data must be set for PDF documents" in str(context.value)
 
     parser_output_no_html_data = parser_output_json_pdf.copy()
     parser_output_no_html_data["html_data"] = None
     parser_output_no_html_data["document_content_type"] = CONTENT_TYPE_HTML
 
-    with unittest.TestCase().assertRaises(
-        pydantic.error_wrappers.ValidationError
-    ) as context:
+    with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
         ParserOutput.parse_obj(parser_output_no_html_data)
-    assert "html_data must be set for HTML documents" in str(context.exception)
+    assert "html_data must be set for HTML documents" in str(context.value)
 
     parser_output_no_content_type = parser_output_json_pdf.copy()
     # PDF data is set as the default
     parser_output_no_content_type["document_content_type"] = None
 
-    with unittest.TestCase().assertRaises(
-        pydantic.error_wrappers.ValidationError
-    ) as context:
+    with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
         ParserOutput.parse_obj(parser_output_no_content_type)
     assert (
         "html_data and pdf_data must be null for documents with no content type."
-    ) in str(context.exception)
+    ) in str(context.value)
 
     parser_output_not_known_content_type = parser_output_json_pdf.copy()
     # PDF data is set as the default
     parser_output_not_known_content_type["document_content_type"] = "not_known"
 
-    with unittest.TestCase().assertRaises(
-        pydantic.error_wrappers.ValidationError
-    ) as context:
+    with pytest.raises(pydantic.error_wrappers.ValidationError) as context:
         ParserOutput.parse_obj(parser_output_not_known_content_type)
     assert (
         "html_data and pdf_data must be null for documents with no content type."
-    ) in str(context.exception)
+    ) in str(context.value)
 
     # Test the text blocks property
     assert ParserOutput.parse_obj(parser_output_json_pdf).text_blocks != []
@@ -103,6 +96,17 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
     original_text_blocks = parser_output.text_blocks
     assert parser_output.vertically_flip_text_block_coords() != original_text_blocks
 
+    parser_output = ParserOutput.parse_obj(parser_output_json_pdf)
+    # Set as page number that doesn't exist in the page_metadata field to throw exception
+    assert isinstance(parser_output.text_blocks[0], PDFTextBlock)
+    parser_output.text_blocks[0].page_number = 123456  # type: ignore
+
+    with pytest.raises(VerticalFlipError) as context:
+        parser_output.vertically_flip_text_block_coords()
+    assert str(context.value) == (
+        f"Failed to flip text blocks for {parser_output.document_id}"
+    )
+
     # Test the get_text_blocks method
     # The test html document has invalid html data so the text blocks should be empty
     parser_output = ParserOutput.parse_obj(parser_output_json_html)