1
- import unittest
2
-
3
1
import pydantic
2
+ import pytest
4
3
5
4
from cpr_data_access .parser_models import (
6
5
ParserInput ,
7
6
ParserOutput ,
7
+ VerticalFlipError ,
8
+ PDFTextBlock ,
8
9
)
9
10
from cpr_data_access .pipeline_general_models import (
10
11
CONTENT_TYPE_PDF ,
@@ -47,45 +48,37 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
47
48
parser_output_no_pdf_data ["pdf_data" ] = None
48
49
parser_output_no_pdf_data ["document_content_type" ] = CONTENT_TYPE_PDF
49
50
50
- with unittest .TestCase ().assertRaises (
51
- pydantic .error_wrappers .ValidationError
52
- ) as context :
51
+ with pytest .raises (pydantic .error_wrappers .ValidationError ) as context :
53
52
ParserOutput .parse_obj (parser_output_no_pdf_data )
54
- assert "pdf_data must be set for PDF documents" in str (context .exception )
53
+ assert "pdf_data must be set for PDF documents" in str (context .value )
55
54
56
55
parser_output_no_html_data = parser_output_json_pdf .copy ()
57
56
parser_output_no_html_data ["html_data" ] = None
58
57
parser_output_no_html_data ["document_content_type" ] = CONTENT_TYPE_HTML
59
58
60
- with unittest .TestCase ().assertRaises (
61
- pydantic .error_wrappers .ValidationError
62
- ) as context :
59
+ with pytest .raises (pydantic .error_wrappers .ValidationError ) as context :
63
60
ParserOutput .parse_obj (parser_output_no_html_data )
64
- assert "html_data must be set for HTML documents" in str (context .exception )
61
+ assert "html_data must be set for HTML documents" in str (context .value )
65
62
66
63
parser_output_no_content_type = parser_output_json_pdf .copy ()
67
64
# PDF data is set as the default
68
65
parser_output_no_content_type ["document_content_type" ] = None
69
66
70
- with unittest .TestCase ().assertRaises (
71
- pydantic .error_wrappers .ValidationError
72
- ) as context :
67
+ with pytest .raises (pydantic .error_wrappers .ValidationError ) as context :
73
68
ParserOutput .parse_obj (parser_output_no_content_type )
74
69
assert (
75
70
"html_data and pdf_data must be null for documents with no content type."
76
- ) in str (context .exception )
71
+ ) in str (context .value )
77
72
78
73
parser_output_not_known_content_type = parser_output_json_pdf .copy ()
79
74
# PDF data is set as the default
80
75
parser_output_not_known_content_type ["document_content_type" ] = "not_known"
81
76
82
- with unittest .TestCase ().assertRaises (
83
- pydantic .error_wrappers .ValidationError
84
- ) as context :
77
+ with pytest .raises (pydantic .error_wrappers .ValidationError ) as context :
85
78
ParserOutput .parse_obj (parser_output_not_known_content_type )
86
79
assert (
87
80
"html_data and pdf_data must be null for documents with no content type."
88
- ) in str (context .exception )
81
+ ) in str (context .value )
89
82
90
83
# Test the text blocks property
91
84
assert ParserOutput .parse_obj (parser_output_json_pdf ).text_blocks != []
@@ -103,6 +96,17 @@ def test_parser_output_object(parser_output_json_pdf, parser_output_json_html) -
103
96
original_text_blocks = parser_output .text_blocks
104
97
assert parser_output .vertically_flip_text_block_coords () != original_text_blocks
105
98
99
+ parser_output = ParserOutput .parse_obj (parser_output_json_pdf )
100
+ # Set a page number that doesn't exist in the page_metadata field so that flipping raises an exception
101
+ assert isinstance (parser_output .text_blocks [0 ], PDFTextBlock )
102
+ parser_output .text_blocks [0 ].page_number = 123456 # type: ignore
103
+
104
+ with pytest .raises (VerticalFlipError ) as context :
105
+ parser_output .vertically_flip_text_block_coords ()
106
+ assert str (context .value ) == (
107
+ f"Failed to flip text blocks for { parser_output .document_id } "
108
+ )
109
+
106
110
# Test the get_text_blocks method
107
111
# The test html document has invalid html data so the text blocks should be empty
108
112
parser_output = ParserOutput .parse_obj (parser_output_json_html )
0 commit comments