@@ -63,13 +63,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
63
63
warnings = []
64
64
65
65
with tempfile .TemporaryDirectory () as tmp_dir :
66
- lines , tables , tables_on_images , attachments , document_metadata = self .__extract (
67
- path = file_path ,
68
- parameters = parameters ,
69
- warnings = warnings ,
70
- tmp_dir = tmp_dir
71
- )
72
- lines = self .linker .link_objects (lines = lines , tables = tables_on_images , images = attachments )
66
+ lines , tables , attachments , document_metadata = self .__extract (path = file_path , parameters = parameters , warnings = warnings , tmp_dir = tmp_dir )
73
67
74
68
if get_param_with_attachments (parameters ) and self .attachment_extractor .can_extract (file_path ):
75
69
attachments += self .attachment_extractor .extract (file_path = file_path , parameters = parameters )
@@ -80,15 +74,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
80
74
81
75
return self ._postprocess (result )
82
76
83
- def __extract (self , path : str , parameters : dict , warnings : list , tmp_dir : str )\
84
- -> Tuple [List [LineWithMeta ], List [Table ], List [ScanTable ], List [ PdfImageAttachment ], Optional [dict ]]:
77
+ def __extract (self , path : str , parameters : dict , warnings : List [ str ] , tmp_dir : str )\
78
+ -> Tuple [List [LineWithMeta ], List [Table ], List [PdfImageAttachment ], Optional [dict ]]:
85
79
import math
86
80
from dedoc .utils .pdf_utils import get_pdf_page_count
87
81
from dedoc .utils .utils import calculate_file_hash
88
82
from dedoc .utils .parameter_utils import get_param_page_slice , get_param_with_attachments
89
83
from dedoc .utils .parameter_utils import get_param_need_gost_frame_analysis
90
84
91
- all_lines , all_tables , all_tables_on_images , all_attached_images = [], [], [], []
85
+ all_lines , all_tables , all_scan_tables , all_attached_images = [], [], [], []
92
86
with_attachments = get_param_with_attachments (parameters )
93
87
document_metadata = None
94
88
@@ -106,12 +100,11 @@ def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
106
100
document_metadata ["last_page" ] = last_page
107
101
108
102
if empty_page_limit :
109
- return all_lines , all_tables , all_tables_on_images , all_attached_images , document_metadata
103
+ return all_lines , all_tables , all_attached_images , document_metadata
110
104
111
- gost_json_path = ""
112
- remove_frame = get_param_need_gost_frame_analysis (parameters )
113
- if remove_frame :
114
- gost_json_path = self .__save_gost_frame_boxes_to_json (first_page = first_page , last_page = last_page , page_count = page_count , tmp_dir = tmp_dir , path = path )
105
+ remove_gost_frame = get_param_need_gost_frame_analysis (parameters )
106
+ gost_json_path = self .__save_gost_frame_boxes_to_json (first_page = first_page , last_page = last_page , page_count = page_count , tmp_dir = tmp_dir , path = path ) \
107
+ if remove_gost_frame else ""
115
108
116
109
# in the Java tabby reader, page numbering starts at 1 and end_page is inclusive
117
110
first_tabby_page = first_page + 1 if first_page is not None else 1
@@ -122,24 +115,26 @@ def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
122
115
end_page = last_tabby_page ,
123
116
tmp_dir = tmp_dir ,
124
117
gost_json_path = gost_json_path ,
125
- remove_frame = remove_frame )
118
+ remove_frame = remove_gost_frame )
126
119
127
120
pages = document .get ("pages" , [])
128
121
for page in pages :
129
122
page_lines = self .__get_lines_with_location (page , file_hash )
130
123
if page_lines :
131
124
all_lines .extend (page_lines )
132
- page_tables , table_on_images = self .__get_tables (page )
133
- assert len (page_tables ) == len (table_on_images )
134
- if page_tables :
135
- all_tables .extend (page_tables )
136
- all_tables_on_images .extend (table_on_images )
125
+ scan_tables = self .__get_tables (page )
126
+ all_scan_tables .extend (scan_tables )
137
127
138
128
attached_images = self .__get_attached_images (page = page , parameters = parameters , path = path ) if with_attachments else []
139
129
if attached_images :
140
130
all_attached_images .extend (attached_images )
141
131
142
- return all_lines , all_tables , all_tables_on_images , all_attached_images , document_metadata
132
+ mp_tables = self .table_recognizer .convert_to_multipages_tables (all_scan_tables , lines_with_meta = all_lines )
133
+ all_lines = self .linker .link_objects (lines = all_lines , tables = mp_tables , images = all_attached_images )
134
+
135
+ tables = [scan_table .to_table () for scan_table in mp_tables ]
136
+
137
+ return all_lines , tables , all_attached_images , document_metadata
143
138
144
139
def __save_gost_frame_boxes_to_json (self , first_page : Optional [int ], last_page : Optional [int ], page_count : int , path : str , tmp_dir : str ) -> str :
145
140
from joblib import Parallel , delayed
@@ -150,24 +145,25 @@ def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page:
150
145
images = self ._get_images (path , first_page , last_page )
151
146
152
147
gost_analyzed_images = Parallel (n_jobs = self .config ["n_jobs" ])(delayed (self .gost_frame_recognizer .rec_and_clean_frame )(image ) for image in images )
148
+
153
149
result_dict = {
154
150
page_number : {** page_data [1 ].to_dict (), ** {"original_image_width" : page_data [2 ][1 ], "original_image_height" : page_data [2 ][0 ]}}
155
151
for page_number , page_data in enumerate (gost_analyzed_images , start = first_page )
156
152
}
153
+
157
154
result_json_path = os .path .join (tmp_dir , "gost_frame_bboxes.json" )
158
155
with open (result_json_path , "w" ) as f :
159
156
json .dump (result_dict , f )
157
+
160
158
return result_json_path
161
159
162
- def __get_tables (self , page : dict ) -> Tuple [ List [Table ], List [ ScanTable ] ]:
160
+ def __get_tables (self , page : dict ) -> List [ScanTable ]:
163
161
import uuid
164
162
from dedoc .data_structures .cell_with_meta import CellWithMeta
165
163
from dedoc .data_structures .concrete_annotations .bbox_annotation import BBoxAnnotation
166
164
from dedoc .data_structures .line_metadata import LineMetadata
167
- from dedoc .data_structures .table_metadata import TableMetadata
168
165
169
- tables = []
170
- tables_on_image = []
166
+ scan_tables = []
171
167
page_number = page ["number" ]
172
168
page_width = int (page ["width" ])
173
169
page_height = int (page ["height" ])
@@ -179,7 +175,7 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
179
175
cell_properties = table ["cell_properties" ]
180
176
assert len (rows ) == len (cell_properties )
181
177
182
- result_cells = []
178
+ cells = []
183
179
for num_row , row in enumerate (rows ):
184
180
assert len (row ) == len (cell_properties [num_row ])
185
181
@@ -191,20 +187,22 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
191
187
for c in cell_blocks :
192
188
cell_bbox = BBox (x_top_left = int (c ["x_top_left" ]), y_top_left = int (c ["y_top_left" ]), width = int (c ["width" ]), height = int (c ["height" ]))
193
189
annotations .append (BBoxAnnotation (c ["start" ], c ["end" ], cell_bbox , page_width = page_width , page_height = page_height ))
190
+ """
191
+ TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable"
192
+ https://jira.intra.ispras.ru/browse/TLDR-851
193
+ """
194
194
195
195
result_row .append (CellWithMeta (
196
196
lines = [LineWithMeta (line = cell ["text" ], metadata = LineMetadata (page_id = page_number , line_id = 0 ), annotations = annotations )],
197
197
colspan = cell_properties [num_row ][num_col ]["col_span" ],
198
198
rowspan = cell_properties [num_row ][num_col ]["row_span" ],
199
199
invisible = bool (cell_properties [num_row ][num_col ]["invisible" ])
200
200
))
201
- result_cells .append (result_row )
201
+ cells .append (result_row )
202
202
203
- table_name = str (uuid .uuid4 ())
204
- tables .append (Table (cells = result_cells , metadata = TableMetadata (page_id = page_number , uid = table_name )))
205
- tables_on_image .append (ScanTable (page_number = page_number , matrix_cells = None , bbox = table_bbox , name = table_name , order = order ))
203
+ scan_tables .append (ScanTable (page_number = page_number , matrix_cells = cells , bbox = table_bbox , name = str (uuid .uuid4 ()), order = order ))
206
204
207
- return tables , tables_on_image
205
+ return scan_tables
208
206
209
207
def __get_attached_images (self , page : dict , parameters : dict , path : str ) -> List [PdfImageAttachment ]:
210
208
import os
0 commit comments