Skip to content

Commit 0230daa

Browse files
committed
TLDR-850 added multipage into tabby reader
1 parent e978200 commit 0230daa

File tree

8 files changed

+123
-97
lines changed

8 files changed

+123
-97
lines changed

dedoc/readers/pdf_reader/data_classes/tables/cell.py

+9-16
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
from dedocutils.data_structures import BBox
44

55
from dedoc.data_structures.annotation import Annotation
6+
from dedoc.data_structures.cell_with_meta import CellWithMeta
67
from dedoc.data_structures.line_with_meta import LineWithMeta
78

89

9-
class Cell:
10+
class Cell(CellWithMeta):
1011

1112
@staticmethod
1213
def copy_from(cell: "Cell",
@@ -41,35 +42,27 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
4142
if self.con_coord:
4243
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
4344

44-
def __init__(self,
45-
x_top_left: int,
46-
x_bottom_right: int,
47-
y_top_left: int,
48-
y_bottom_right: int,
49-
id_con: int = -1,
50-
lines: Optional[List[LineWithMeta]] = None,
51-
is_attribute: bool = False,
52-
is_attribute_required: bool = False,
53-
rotated_angle: int = 0,
54-
uid: str = None,
45+
def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
46+
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
5547
contour_coord: Optional[BBox] = None) -> None:
48+
5649
import uuid
5750

5851
assert x_top_left <= x_bottom_right
5952
assert y_top_left <= y_bottom_right
53+
54+
self.lines = [] if lines is None else lines
55+
super().__init__(lines)
56+
6057
self.x_top_left = x_top_left
6158
self.x_bottom_right = x_bottom_right
6259
self.y_top_left = y_top_left
6360
self.y_bottom_right = y_bottom_right
6461
self.id_con = id_con
65-
self.lines = [] if lines is None else lines
6662
self.is_attribute = is_attribute
6763
self.is_attribute_required = is_attribute_required
6864
self.rotated_angle = rotated_angle
6965
self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
70-
self.colspan = 1
71-
self.rowspan = 1
72-
self.invisible = False
7366
self.con_coord = contour_coord or BBox(0, 0, 0, 0)
7467

7568
def __str__(self) -> str:

dedoc/readers/pdf_reader/data_classes/tables/location.py

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
1010
self.page_number = page_number
1111
self.bbox = bbox
1212
self.name = name
13+
# TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
1314
self.rotated_angle = rotated_angle
1415

1516
def shift(self, shift_x: int, shift_y: int) -> None:

dedoc/readers/pdf_reader/data_classes/tables/scantable.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, List
1+
from typing import Any, List, Optional
22

33
from dedocutils.data_structures import BBox
44

@@ -10,7 +10,8 @@
1010

1111

1212
class ScanTable:
13-
def __init__(self, page_number: int, matrix_cells: List[List[Cell]] = None, bbox: BBox = None, name: str = "", order: int = -1) -> None:
13+
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
14+
name: str = "", order: int = -1) -> None:
1415
self.matrix_cells = matrix_cells
1516
self.page_number = page_number
1617
self.locations = []
@@ -27,6 +28,15 @@ def extended(self, table: "ScanTable") -> None:
2728
# extend order
2829
self.order = max(self.order, table.order)
2930

31+
def check_on_cell_instance(self) -> bool:
    """
    Tell whether this table's cell matrix is made of ``Cell`` objects.

    Only the very first cell (row 0, column 0) is inspected; the rest of the matrix is not checked.

    :return: True if ``matrix_cells`` has at least one row, that row has at least one cell and this cell
        is an instance of ``Cell``; False otherwise (including when ``matrix_cells`` is None or empty).
    """
    # matrix_cells defaults to None in the constructor, so guard against it before indexing
    if not self.matrix_cells or not self.matrix_cells[0]:
        return False
    return isinstance(self.matrix_cells[0][0], Cell)
39+
3040
def to_table(self) -> Table:
3141
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
3242
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]

dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import copy
12
import logging
23
from typing import List
34

@@ -155,24 +156,26 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
155156
# condition 2. Exclusion of the duplicated header (if any)
156157
attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells)
157158
attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells)
159+
t2_update = copy.deepcopy(t2)
158160
if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
159-
t2.matrix_cells = t2.matrix_cells[len(attr2):]
161+
t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):]
160162

161-
if len(t2.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
163+
if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
162164
return False
163165

164-
TableAttributeExtractor.clear_attributes(t2.matrix_cells)
166+
TableAttributeExtractor.clear_attributes(t2_update.matrix_cells)
165167

166168
# condition 3. Number of columns should be equal
167-
if len(t1.matrix_cells[-1]) != len(t2.matrix_cells[0]):
169+
if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]):
168170
if self.config.get("debug_mode", False):
169171
self.logger.debug("Different count column")
170172
return False
171173

172174
# condition 4. Comparison of the widths of last and first rows
173-
if not self.__is_equal_width_cells(t1.matrix_cells, t2.matrix_cells):
175+
if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells):
174176
if self.config.get("debug_mode", False):
175177
self.logger.debug("Different width columns")
176178
return False
177179

180+
t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes
178181
return True

dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py

+17
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,22 @@ def is_equal_attributes(attr1: List[List[Cell]], attr2: List[List[Cell]], thr_si
3131

3232
return True
3333

34+
@staticmethod
35+
def check_have_attributes(matrix_table: List[List[Cell]]) -> bool:
    """
    Check whether the cells of the given matrix expose the ``is_attribute`` field.

    Only the first cell of the first row is probed.

    :param matrix_table: matrix (list of rows) of table cells
    :return: True if the matrix has a first row with a first cell that has an ``is_attribute`` attribute,
        False if the matrix is empty, its first row is empty, or that cell lacks the attribute.
    """
    has_first_cell = len(matrix_table) > 0 and len(matrix_table[0]) > 0
    return has_first_cell and hasattr(matrix_table[0][0], "is_attribute")
43+
3444
@staticmethod
3545
def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:
46+
47+
if not TableAttributeExtractor.check_have_attributes(matrix_table):
48+
return matrix_table[:1]
49+
3650
header_rows = len(matrix_table)
3751
for (i, row) in enumerate(matrix_table):
3852
attrs = [cell for cell in row if cell.is_attribute]
@@ -44,6 +58,9 @@ def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:
4458

4559
@staticmethod
4660
def clear_attributes(matrix_table: List[List[Cell]]) -> None:
61+
if not TableAttributeExtractor.check_have_attributes(matrix_table):
62+
return
63+
4764
for row in matrix_table:
4865
for cell in row:
4966
cell.is_attribute = False

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py

+30-32
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
6363
warnings = []
6464

6565
with tempfile.TemporaryDirectory() as tmp_dir:
66-
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(
67-
path=file_path,
68-
parameters=parameters,
69-
warnings=warnings,
70-
tmp_dir=tmp_dir
71-
)
72-
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=attachments)
66+
lines, tables, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings, tmp_dir=tmp_dir)
7367

7468
if get_param_with_attachments(parameters) and self.attachment_extractor.can_extract(file_path):
7569
attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
@@ -80,15 +74,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
8074

8175
return self._postprocess(result)
8276

83-
def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
84-
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
77+
def __extract(self, path: str, parameters: dict, warnings: List[str], tmp_dir: str)\
78+
-> Tuple[List[LineWithMeta], List[Table], List[PdfImageAttachment], Optional[dict]]:
8579
import math
8680
from dedoc.utils.pdf_utils import get_pdf_page_count
8781
from dedoc.utils.utils import calculate_file_hash
8882
from dedoc.utils.parameter_utils import get_param_page_slice, get_param_with_attachments
8983
from dedoc.utils.parameter_utils import get_param_need_gost_frame_analysis
9084

91-
all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
85+
all_lines, all_tables, all_scan_tables, all_attached_images = [], [], [], []
9286
with_attachments = get_param_with_attachments(parameters)
9387
document_metadata = None
9488

@@ -106,12 +100,11 @@ def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
106100
document_metadata["last_page"] = last_page
107101

108102
if empty_page_limit:
109-
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
103+
return all_lines, all_tables, all_attached_images, document_metadata
110104

111-
gost_json_path = ""
112-
remove_frame = get_param_need_gost_frame_analysis(parameters)
113-
if remove_frame:
114-
gost_json_path = self.__save_gost_frame_boxes_to_json(first_page=first_page, last_page=last_page, page_count=page_count, tmp_dir=tmp_dir, path=path)
105+
remove_gost_frame = get_param_need_gost_frame_analysis(parameters)
106+
gost_json_path = self.__save_gost_frame_boxes_to_json(first_page=first_page, last_page=last_page, page_count=page_count, tmp_dir=tmp_dir, path=path) \
107+
if remove_gost_frame else ""
115108

116109
# in java tabby reader page numeration starts with 1, end_page is included
117110
first_tabby_page = first_page + 1 if first_page is not None else 1
@@ -122,24 +115,26 @@ def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
122115
end_page=last_tabby_page,
123116
tmp_dir=tmp_dir,
124117
gost_json_path=gost_json_path,
125-
remove_frame=remove_frame)
118+
remove_frame=remove_gost_frame)
126119

127120
pages = document.get("pages", [])
128121
for page in pages:
129122
page_lines = self.__get_lines_with_location(page, file_hash)
130123
if page_lines:
131124
all_lines.extend(page_lines)
132-
page_tables, table_on_images = self.__get_tables(page)
133-
assert len(page_tables) == len(table_on_images)
134-
if page_tables:
135-
all_tables.extend(page_tables)
136-
all_tables_on_images.extend(table_on_images)
125+
scan_tables = self.__get_tables(page)
126+
all_scan_tables.extend(scan_tables)
137127

138128
attached_images = self.__get_attached_images(page=page, parameters=parameters, path=path) if with_attachments else []
139129
if attached_images:
140130
all_attached_images.extend(attached_images)
141131

142-
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
132+
mp_tables = self.table_recognizer.convert_to_multipages_tables(all_scan_tables, lines_with_meta=all_lines)
133+
all_lines = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=all_attached_images)
134+
135+
tables = [scan_table.to_table() for scan_table in mp_tables]
136+
137+
return all_lines, tables, all_attached_images, document_metadata
143138

144139
def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: Optional[int], page_count: int, path: str, tmp_dir: str) -> str:
145140
from joblib import Parallel, delayed
@@ -150,24 +145,25 @@ def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page:
150145
images = self._get_images(path, first_page, last_page)
151146

152147
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
148+
153149
result_dict = {
154150
page_number: {**page_data[1].to_dict(), **{"original_image_width": page_data[2][1], "original_image_height": page_data[2][0]}}
155151
for page_number, page_data in enumerate(gost_analyzed_images, start=first_page)
156152
}
153+
157154
result_json_path = os.path.join(tmp_dir, "gost_frame_bboxes.json")
158155
with open(result_json_path, "w") as f:
159156
json.dump(result_dict, f)
157+
160158
return result_json_path
161159

162-
def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
160+
def __get_tables(self, page: dict) -> List[ScanTable]:
163161
import uuid
164162
from dedoc.data_structures.cell_with_meta import CellWithMeta
165163
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
166164
from dedoc.data_structures.line_metadata import LineMetadata
167-
from dedoc.data_structures.table_metadata import TableMetadata
168165

169-
tables = []
170-
tables_on_image = []
166+
scan_tables = []
171167
page_number = page["number"]
172168
page_width = int(page["width"])
173169
page_height = int(page["height"])
@@ -179,7 +175,7 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
179175
cell_properties = table["cell_properties"]
180176
assert len(rows) == len(cell_properties)
181177

182-
result_cells = []
178+
cells = []
183179
for num_row, row in enumerate(rows):
184180
assert len(row) == len(cell_properties[num_row])
185181

@@ -191,20 +187,22 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
191187
for c in cell_blocks:
192188
cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"]))
193189
annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height))
190+
"""
191+
TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable"
192+
https://jira.intra.ispras.ru/browse/TLDR-851
193+
"""
194194

195195
result_row.append(CellWithMeta(
196196
lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)],
197197
colspan=cell_properties[num_row][num_col]["col_span"],
198198
rowspan=cell_properties[num_row][num_col]["row_span"],
199199
invisible=bool(cell_properties[num_row][num_col]["invisible"])
200200
))
201-
result_cells.append(result_row)
201+
cells.append(result_row)
202202

203-
table_name = str(uuid.uuid4())
204-
tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, uid=table_name)))
205-
tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order))
203+
scan_tables.append(ScanTable(page_number=page_number, matrix_cells=cells, bbox=table_bbox, name=str(uuid.uuid4()), order=order))
206204

207-
return tables, tables_on_image
205+
return scan_tables
208206

209207
def __get_attached_images(self, page: dict, parameters: dict, path: str) -> List[PdfImageAttachment]:
210208
import os

0 commit comments

Comments
 (0)