Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tabby remove frame #501

Merged
merged 16 commits into from
Nov 14, 2024
6 changes: 3 additions & 3 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional

from dedoc.api.schema.cell_with_meta import CellWithMeta as ApiCellWithMeta
from dedoc.data_structures.annotation import Annotation
Expand All @@ -20,14 +20,14 @@ class CellWithMeta(Serializable):
:vartype rowspan: int
:vartype invisible: bool
"""
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
def __init__(self, lines: Optional[List[LineWithMeta]], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
"""
:param lines: textual lines of the cell
:param colspan: number of columns to span like in HTML format
:param rowspan: number of rows to span like in HTML format
:param invisible: indicator for displaying or hiding cell text
"""
self.lines: List[LineWithMeta] = lines
self.lines: List[LineWithMeta] = [] if lines is None else lines
self.colspan: int = colspan
self.rowspan: int = rowspan
self.invisible: bool = invisible
Expand Down
25 changes: 9 additions & 16 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_with_meta import LineWithMeta


class Cell:
class Cell(CellWithMeta):

@staticmethod
def copy_from(cell: "Cell",
Expand Down Expand Up @@ -41,35 +42,27 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
if self.con_coord:
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)

def __init__(self,
x_top_left: int,
x_bottom_right: int,
y_top_left: int,
y_bottom_right: int,
id_con: int = -1,
lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False,
is_attribute_required: bool = False,
rotated_angle: int = 0,
uid: str = None,
def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
contour_coord: Optional[BBox] = None) -> None:

import uuid

assert x_top_left <= x_bottom_right
assert y_top_left <= y_bottom_right

self.lines = [] if lines is None else lines
super().__init__(lines)

self.x_top_left = x_top_left
self.x_bottom_right = x_bottom_right
self.y_top_left = y_top_left
self.y_bottom_right = y_bottom_right
self.id_con = id_con
self.lines = [] if lines is None else lines
self.is_attribute = is_attribute
self.is_attribute_required = is_attribute_required
self.rotated_angle = rotated_angle
self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
self.colspan = 1
self.rowspan = 1
self.invisible = False
self.con_coord = contour_coord or BBox(0, 0, 0, 0)

def __str__(self) -> str:
Expand Down
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
self.page_number = page_number
self.bbox = bbox
self.name = name
# TODO: put self.order here (requires changing LineWithLocation, PdfImageAttachment and ScanTable)
self.rotated_angle = rotated_angle

def shift(self, shift_x: int, shift_y: int) -> None:
Expand Down
14 changes: 12 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, List, Optional

from dedocutils.data_structures import BBox

Expand All @@ -10,7 +10,8 @@


class ScanTable:
def __init__(self, page_number: int, matrix_cells: List[List[Cell]] = None, bbox: BBox = None, name: str = "", order: int = -1) -> None:
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
name: str = "", order: int = -1) -> None:
self.matrix_cells = matrix_cells
self.page_number = page_number
self.locations = []
Expand All @@ -27,6 +28,15 @@ def extended(self, table: "ScanTable") -> None:
# extend order
self.order = max(self.order, table.order)

def check_on_cell_instance(self) -> bool:
    """
    Check whether the table matrix is filled with ``Cell`` objects.

    Only the first element of the first row is inspected; the rest of the matrix is not examined.

    :return: True if ``matrix_cells`` has a non-empty first row whose first element is a ``Cell`` instance,
        False otherwise (including when ``matrix_cells`` is None or empty)
    """
    # matrix_cells is optional in the constructor, so guard against None as well as empty containers
    if not self.matrix_cells or not self.matrix_cells[0]:
        return False
    return isinstance(self.matrix_cells[0][0], Cell)

def to_table(self) -> Table:
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
Expand Down
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable


ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
"orient_cell_angle",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import logging
from typing import List

Expand Down Expand Up @@ -155,24 +156,26 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
# condition 2. Exclusion of the duplicated header (if any)
attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells)
attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells)
t2_update = copy.deepcopy(t2)
if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
t2.matrix_cells = t2.matrix_cells[len(attr2):]
t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):]

if len(t2.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
return False

TableAttributeExtractor.clear_attributes(t2.matrix_cells)
TableAttributeExtractor.clear_attributes(t2_update.matrix_cells)

# condition 3. Number of columns should be equal
if len(t1.matrix_cells[-1]) != len(t2.matrix_cells[0]):
if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]):
if self.config.get("debug_mode", False):
self.logger.debug("Different count column")
return False

# condition 4. Comparison of the widths of last and first rows
if not self.__is_equal_width_cells(t1.matrix_cells, t2.matrix_cells):
if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells):
if self.config.get("debug_mode", False):
self.logger.debug("Different width columns")
return False

t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes
return True
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,22 @@ def is_equal_attributes(attr1: List[List[Cell]], attr2: List[List[Cell]], thr_si

return True

@staticmethod
def check_have_attributes(matrix_table: List[List[Cell]]) -> bool:
    """
    Tell whether the cells of the given matrix expose the ``is_attribute`` attribute.

    Only the first cell of the first row is probed.

    :param matrix_table: table cells given as a list of rows
    :return: False for an empty matrix, an empty first row, or a first cell without ``is_attribute``; True otherwise
    """
    has_first_cell = len(matrix_table) != 0 and len(matrix_table[0]) != 0
    return has_first_cell and hasattr(matrix_table[0][0], "is_attribute")

@staticmethod
def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:

if not TableAttributeExtractor.check_have_attributes(matrix_table):
return matrix_table[:1]

header_rows = len(matrix_table)
for (i, row) in enumerate(matrix_table):
attrs = [cell for cell in row if cell.is_attribute]
Expand All @@ -44,6 +58,9 @@ def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]:

@staticmethod
def clear_attributes(matrix_table: List[List[Cell]]) -> None:
if not TableAttributeExtractor.check_have_attributes(matrix_table):
return

for row in matrix_table:
for cell in row:
cell.is_attribute = False
Expand Down
107 changes: 76 additions & 31 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os.path
from typing import List, Optional, Tuple

from dedocutils.data_structures import BBox
Expand Down Expand Up @@ -62,13 +63,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
warnings = []

with tempfile.TemporaryDirectory() as tmp_dir:
lines, tables, tables_on_images, attachments, document_metadata = self.__extract(
path=file_path,
parameters=parameters,
warnings=warnings,
tmp_dir=tmp_dir
)
lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=attachments)
lines, tables, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings, tmp_dir=tmp_dir)

if get_param_with_attachments(parameters) and self.attachment_extractor.can_extract(file_path):
attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
Expand All @@ -79,14 +74,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure

return self._postprocess(result)

def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
-> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]:
def __extract(self, path: str, parameters: dict, warnings: List[str], tmp_dir: str)\
-> Tuple[List[LineWithMeta], List[Table], List[PdfImageAttachment], Optional[dict]]:
import math
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import calculate_file_hash
from dedoc.utils.parameter_utils import get_param_page_slice, get_param_with_attachments
from dedoc.utils.parameter_utils import get_param_need_gost_frame_analysis

all_lines, all_tables, all_tables_on_images, all_attached_images = [], [], [], []
all_lines, all_tables, all_scan_tables, all_attached_images = [], [], [], []
with_attachments = get_param_with_attachments(parameters)
document_metadata = None

Expand All @@ -104,40 +100,70 @@ def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
document_metadata["last_page"] = last_page

if empty_page_limit:
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
return all_lines, all_tables, all_attached_images, document_metadata

remove_gost_frame = get_param_need_gost_frame_analysis(parameters)
gost_json_path = self.__save_gost_frame_boxes_to_json(first_page=first_page, last_page=last_page, page_count=page_count, tmp_dir=tmp_dir, path=path) \
if remove_gost_frame else ""

# in the Java tabby reader, page numbering starts at 1 and end_page is inclusive
first_tabby_page = first_page + 1 if first_page is not None else 1
last_tabby_page = page_count if (last_page is None) or (last_page is not None and last_page > page_count) else last_page
self.logger.info(f"Reading PDF pages from {first_tabby_page} to {last_tabby_page}")
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page, tmp_dir=tmp_dir)
document = self.__process_pdf(path=path,
start_page=first_tabby_page,
end_page=last_tabby_page,
tmp_dir=tmp_dir,
gost_json_path=gost_json_path,
remove_frame=remove_gost_frame)

pages = document.get("pages", [])
for page in pages:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
page_tables, table_on_images = self.__get_tables(page)
assert len(page_tables) == len(table_on_images)
if page_tables:
all_tables.extend(page_tables)
all_tables_on_images.extend(table_on_images)
scan_tables = self.__get_tables(page)
all_scan_tables.extend(scan_tables)

attached_images = self.__get_attached_images(page=page, parameters=parameters, path=path) if with_attachments else []
if attached_images:
all_attached_images.extend(attached_images)

return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
mp_tables = self.table_recognizer.convert_to_multipages_tables(all_scan_tables, lines_with_meta=all_lines)
all_lines = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=all_attached_images)

tables = [scan_table.to_table() for scan_table in mp_tables]

return all_lines, tables, all_attached_images, document_metadata

def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: Optional[int], page_count: int, path: str, tmp_dir: str) -> str:
from joblib import Parallel, delayed
import json

first_page = 0 if first_page is None or first_page < 0 else first_page
last_page = page_count if (last_page is None) or (last_page is not None and last_page > page_count) else last_page
images = self._get_images(path, first_page, last_page)

gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)

result_dict = {
page_number: {**page_data[1].to_dict(), **{"original_image_width": page_data[2][1], "original_image_height": page_data[2][0]}}
for page_number, page_data in enumerate(gost_analyzed_images, start=first_page)
}

def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
result_json_path = os.path.join(tmp_dir, "gost_frame_bboxes.json")
with open(result_json_path, "w") as f:
json.dump(result_dict, f)

return result_json_path

def __get_tables(self, page: dict) -> List[ScanTable]:
import uuid
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.table_metadata import TableMetadata

tables = []
tables_on_image = []
scan_tables = []
page_number = page["number"]
page_width = int(page["width"])
page_height = int(page["height"])
Expand All @@ -149,7 +175,7 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
cell_properties = table["cell_properties"]
assert len(rows) == len(cell_properties)

result_cells = []
cells = []
for num_row, row in enumerate(rows):
assert len(row) == len(cell_properties[num_row])

Expand All @@ -161,20 +187,22 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
for c in cell_blocks:
cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"]))
annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height))
"""
TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable"
https://jira.intra.ispras.ru/browse/TLDR-851
"""

result_row.append(CellWithMeta(
lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)],
colspan=cell_properties[num_row][num_col]["col_span"],
rowspan=cell_properties[num_row][num_col]["row_span"],
invisible=bool(cell_properties[num_row][num_col]["invisible"])
))
result_cells.append(result_row)
cells.append(result_row)

table_name = str(uuid.uuid4())
tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, uid=table_name)))
tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order))
scan_tables.append(ScanTable(page_number=page_number, matrix_cells=cells, bbox=table_bbox, name=str(uuid.uuid4()), order=order))

return tables, tables_on_image
return scan_tables

def __get_attached_images(self, page: dict, parameters: dict, path: str) -> List[PdfImageAttachment]:
import os
Expand Down Expand Up @@ -291,10 +319,20 @@ def __jar_path(self) -> str:
import os
return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"])

def __run(self, path: str, tmp_dir: str, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes:
def __run(self,
path: str,
tmp_dir: str,
encoding: str = "utf-8",
start_page: int = None,
end_page: int = None,
remove_frame: bool = False,
gost_json_path: str = ""
) -> bytes:
import subprocess

args = ["java"] + ["-jar", self.__jar_path(), "-i", path, "-tmp", f"{tmp_dir}/"]
if remove_frame:
args += ["-rf", gost_json_path]
if start_page is not None and end_page is not None:
args += ["-sp", str(start_page), "-ep", str(end_page)]
try:
Expand All @@ -307,11 +345,18 @@ def __run(self, path: str, tmp_dir: str, encoding: str = "utf-8", start_page: in
except subprocess.CalledProcessError as e:
raise TabbyPdfError(e.stderr.decode(encoding))

def __process_pdf(self, path: str, tmp_dir: str, start_page: int = None, end_page: int = None) -> dict:
def __process_pdf(self,
path: str,
tmp_dir: str,
start_page: int = None,
end_page: int = None,
gost_json_path: str = "",
remove_frame: bool = False) -> dict:
import json
import os

self.__run(path=path, start_page=start_page, end_page=end_page, tmp_dir=tmp_dir)
self.__run(path=path, start_page=start_page, end_page=end_page, tmp_dir=tmp_dir, remove_frame=remove_frame, gost_json_path=gost_json_path)

with open(os.path.join(tmp_dir, "data.json"), "r") as response:
document = json.load(response)

Expand Down
Binary file not shown.
Loading
Loading