diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 6bf31bd2..3492e1a7 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -10,6 +10,7 @@ from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation @@ -148,16 +149,18 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]: def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]: lines = [] page_number = page["number"] + page_width = int(page["width"]) + page_height = int(page["height"]) prev_line = None for block in page["blocks"]: annotations = [] order = block["order"] block_text = block["text"] - bx_top_left = block["x_top_left"] - by_top_left = block["y_top_left"] - bx_bottom_right = bx_top_left + block["width"] - by_bottom_right = by_top_left + block["height"] + bx_top_left = int(block["x_top_left"]) + by_top_left = int(block["y_top_left"]) + bx_bottom_right = bx_top_left + int(block["width"]) + by_bottom_right = by_top_left + int(block["height"]) indent = block["indent"] spacing = block["spacing"] len_block = len(block_text) @@ -173,7 +176,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith url = annotation["url"] start = annotation["start"] end = annotation["end"] - + x_top_left = int(annotation["x_top_left"]) + y_top_left = 
int(annotation["y_top_left"]) + x_bottom_right = x_top_left + int(annotation["width"]) + y_bottom_right = y_top_left + int(annotation["height"]) + box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) + annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height)) annotations.append(SizeAnnotation(start, end, str(font_size))) annotations.append(StyleAnnotation(start, end, font_name)) @@ -189,6 +197,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith meta = block["metadata"].lower() uid = f"txt_{file_hash}_{order}" bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right)) + annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height)) metadata = LineMetadata(page_id=page_number, line_id=order) line_with_location = LineWithLocation(line=block_text, diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index 67d66b5a..d9d046e9 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -2,6 +2,9 @@ import unittest from typing import List +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation +from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -267,3 +270,13 @@ def test_pdf_with_tables(self) -> None: node = self._get_by_tree_path(tree, "0.4.2") self.assertEqual("list_item", node["metadata"]["paragraph_type"]) self.assertEqual("3. 
В соответствии с полученной", node["text"].strip()[:30]) + + def test_pdf_annotations(self) -> None: + file_name = "Document635.pdf" + result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby")) + content = result["content"]["structure"]["subparagraphs"] + annotations = content[0]["annotations"] + annotation_names = {annotation["name"] for annotation in annotations} + self.assertIn(BoldAnnotation.name, annotation_names) + self.assertIn(SpacingAnnotation.name, annotation_names) + self.assertIn(BBoxAnnotation.name, annotation_names)