Skip to content

Commit

Permalink
Add BBoxAnnotation to TabbyPDF reader (#312)
Browse files (browse the repository at this point in the history)
* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import
  • Loading branch information
sunveil authored Aug 24, 2023
1 parent 9a1f7ff commit 13938e4
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
19 changes: 14 additions & 5 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError
from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
Expand Down Expand Up @@ -148,16 +149,18 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
page_number = page["number"]
page_width = int(page["width"])
page_height = int(page["height"])
prev_line = None

for block in page["blocks"]:
annotations = []
order = block["order"]
block_text = block["text"]
bx_top_left = block["x_top_left"]
by_top_left = block["y_top_left"]
bx_bottom_right = bx_top_left + block["width"]
by_bottom_right = by_top_left + block["height"]
bx_top_left = int(block["x_top_left"])
by_top_left = int(block["y_top_left"])
bx_bottom_right = bx_top_left + int(block["width"])
by_bottom_right = by_top_left + int(block["height"])
indent = block["indent"]
spacing = block["spacing"]
len_block = len(block_text)
Expand All @@ -173,7 +176,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
url = annotation["url"]
start = annotation["start"]
end = annotation["end"]

x_top_left = int(annotation["x_top_left"])
y_top_left = int(annotation["y_top_left"])
x_bottom_right = bx_top_left + int(annotation["width"])
y_bottom_right = by_top_left + int(annotation["height"])
box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
annotations.append(SizeAnnotation(start, end, str(font_size)))
annotations.append(StyleAnnotation(start, end, font_name))

Expand All @@ -189,6 +197,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
meta = block["metadata"].lower()
uid = f"txt_{file_hash}_{order}"
bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right))
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))

metadata = LineMetadata(page_id=page_number, line_id=order)
line_with_location = LineWithLocation(line=block_text,
Expand Down
13 changes: 13 additions & 0 deletions tests/api_tests/test_api_format_pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import unittest
from typing import List

from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader


Expand Down Expand Up @@ -267,3 +270,13 @@ def test_pdf_with_tables(self) -> None:
node = self._get_by_tree_path(tree, "0.4.2")
self.assertEqual("list_item", node["metadata"]["paragraph_type"])
self.assertEqual("3. В соответствии с полученной", node["text"].strip()[:30])

def test_pdf_annotations(self) -> None:
    """Check which annotation kinds come back for a PDF parsed with the "tabby" text-layer option.

    Sends "Document635.pdf" with pdf_with_text_layer="tabby" and verifies that the
    first subparagraph of the returned structure carries bold, spacing and bounding-box
    annotations (matched by annotation name).
    """
    response = self._send_request("Document635.pdf", data=dict(pdf_with_text_layer="tabby"))
    first_subparagraph = response["content"]["structure"]["subparagraphs"][0]

    # Collect the distinct annotation names so membership checks ignore duplicates and order.
    found_names = set()
    for item in first_subparagraph["annotations"]:
        found_names.add(item["name"])

    for annotation_class in (BoldAnnotation, SpacingAnnotation, BBoxAnnotation):
        self.assertIn(annotation_class.name, found_names)

0 comments on commit 13938e4

Please sign in to comment.