diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 17816b30..101607ff 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -161,7 +161,7 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag page_range = range(first_page, first_page + len(gost_analyzed_images)) gost_analyzed_images = dict(zip(page_range, gost_analyzed_images)) if isinstance(self, PdfTxtlayerReader): - self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()])) + self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()])) result = Parallel(n_jobs=self.config["n_jobs"])( delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in gost_analyzed_images.items() diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 46528fcd..d277815b 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -4,7 +4,6 @@ from numpy import ndarray from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation -from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader @@ -60,28 +59,32 @@ def _process_one_page(self, if page is None: return [], [], [], [] if parameters.need_gost_frame_analysis: - page_shift = self.gost_frame_boxes[page_number] - self._move_table_cells(tables=tables, page_shift=page_shift, page=page) + page_shift = self.gost_frame_boxes[page_number][0] + self._move_table_cells(tables=tables, page_shift=page_shift, page=self.gost_frame_boxes[page_number][1]) + self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables) readable_block = page_shift # bbox representing the content of the gost frame page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [readable_block])] # exclude boxes outside the frame + unreadable_blocks = [location.bbox for table in tables for location in table.locations] page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)] lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False) - self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables) + + if not parameters.need_gost_frame_analysis: + self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables) return lines, tables, page.attachments, [] - def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: PageWithBBox) -> None: + def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple[int, int]) -> None: """ Move tables back to original coordinates when parsing a document containing a gost frame """ + image_height, image_width = page for table in tables: shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left # shift tables to original coordinates for location in table.locations: location.bbox.shift(shift_x=shift_x, shift_y=shift_y) for row in table.matrix_cells: for cell in row: - image_width, image_height = page.pdf_page_width, page.pdf_page_height cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None: diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py index 37b4fc36..03e2afa1 100644 --- a/scripts/test_words_bbox_extraction.py +++ b/scripts/test_words_bbox_extraction.py @@ -9,7 +9,7 @@ from dedoc.api.dedoc_api import config from dedoc.utils.image_utils import rotate_image -from dedoc.utils.pdf_utils import get_page_image +from dedoc.utils.pdf_utils import get_page_image, get_pdf_page_count from tests.api_tests.abstract_api_test import AbstractTestApiDocReader BboxWithConfsType = namedtuple("WordWithConf", ["start", "end", "bbox", "confs", "text_type"]) @@ -61,12 +61,15 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t return text_type - def __get_words_annotation(self, structure: dict) -> List[BboxWithConfsType]: + def __get_words_annotation(self, structure: dict, page_id: int = 0) -> List[BboxWithConfsType]: stack = [structure] words_annotation = [] while len(stack) > 0: node = stack.pop() + if node["metadata"]["page_id"] != page_id: + stack.extend(node["subparagraphs"]) + continue anns_bbox = [annotation for annotation in node["annotations"] if annotation["name"] == "bounding box"] anns_conf = [annotation for annotation in node["annotations"] if annotation["name"] == "confidence"] @@ -171,6 +174,38 @@ def test_pdf_documents(self) -> None: image = self.__draw_tables_words(tables, image) cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image) + def test_gost_frame_documents(self) -> None: + filename_parameters_outputdir = [ + ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="true", need_gost_frame_analysis="true"), "gost_frame_true"], + ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="false", need_gost_frame_analysis="true"), "gost_frame_false"] + ] + + for file_name, parameters, outputdir in filename_parameters_outputdir: + output_path = os.path.join(self.output_path, outputdir) + os.makedirs(output_path, exist_ok=True) + result = self._send_request(file_name, data=parameters) + structure = result["content"]["structure"] + tables = result["content"]["tables"] + page_count = get_pdf_page_count(self._get_abs_path(file_name)) + + for page_id in range(page_count): + image = np.asarray(get_page_image(self._get_abs_path(file_name), page_id)) + word_annotations = self.__get_words_annotation(structure, page_id=page_id) + if len(word_annotations) > 0: + ann = word_annotations[0] + if ann is not None: + bbox = json.loads(ann.bbox) + image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC) + image = self.__draw_word_annotations(image, word_annotations) + if len(tables) > 0: + if len(word_annotations) == 0: + cell_line = tables[0]["cells"][0][0]["lines"][0] + ann_bbox = [annotation for annotation in cell_line["annotations"] if annotation["name"] == "bounding box"][0] + bbox = json.loads(ann_bbox["value"]) + image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC) + image = self.__draw_tables_words(tables, image) + cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}_{page_id}.png"), image) + def test_table_word_extraction(self) -> None: output_path = os.path.join(self.output_path, "tables") os.makedirs(output_path, exist_ok=True) diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py index a1e48a78..dbd3f10f 100644 --- a/tests/api_tests/test_api_module_table_recognizer.py +++ b/tests/api_tests/test_api_module_table_recognizer.py @@ -1,3 +1,4 @@ +import json import os import unittest from typing import List @@ -214,11 +215,36 @@ def test_detect_small_table(self) -> None: tables = result["content"]["tables"] self.assertEqual(2, len(tables)) + def _test_bbox_annotations(self, node: dict, target_dict: dict) -> None: + annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "bounding box"] + annotations_dict = json.loads(annotations[0]["value"]) + for key in target_dict: + self.assertAlmostEqual(float(annotations_dict[key]), target_dict[key], None, None, delta=0.05) + def test_multipage_gost_table(self) -> None: file_name = "gost_multipage_table.pdf" result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"}) # don't pass pdf_with_text_layer to check condition in PDFBaseReader self.assertTrue(len(result["content"]["tables"][0]["cells"]) > 35) + target_bbox_dict = { + "x_top_left": 0.14, + "y_top_left": 0.11, + "width": 0.07, + "height": 0.01, + "page_width": 1653, + "page_height": 2339 + } + self._test_bbox_annotations(result["content"]["structure"]["subparagraphs"][0], target_bbox_dict) + self.assertTrue("Состав квалификационных испытаний" in result["content"]["structure"]["subparagraphs"][0]["text"]) self.assertTrue("KR13" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) # check the last row of multipage table + target_bbox_dict_1 = { + "x_top_left": 0.15, + "y_top_left": 0.58, + "width": 0.04, + "height": 0.009, + "page_width": 1653, + "page_height": 2339 + } + self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_1) self.assertTrue("R13.1" in result["content"]["tables"][0]["cells"][-1][1]["lines"][0]["text"]) # check that it belongs to first and only table self.assertTrue("Испытание по проверке" in result["content"]["tables"][0]["cells"][-1][2]["lines"][0]["text"]) self.assertTrue("3.6" in result["content"]["tables"][0]["cells"][-1][3]["lines"][0]["text"]) @@ -228,7 +254,26 @@ def test_multipage_gost_table_with_text_layer(self) -> None: file_name = "gost_multipage_table_2.pdf" result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True"}) self.assertEqual(len(result["content"]["tables"][0]["cells"]), 14) + target_bbox_dict = { + "x_top_left": 0.12, + "y_top_left": 0.56, + "width": 0.01, + "height": 0.01, + "page_width": 595, + "page_height": 841 + } + self._test_bbox_annotations(result["content"]["structure"]["subparagraphs"][0]["subparagraphs"][0], target_bbox_dict) + self.assertTrue("Sample text 1" in result["content"]["structure"]["subparagraphs"][0]["subparagraphs"][0]["text"]) self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"]) + target_bbox_dict_1 = { + "x_top_left": 0.13, + "y_top_left": 0.61, + "width": 0.06, + "height": 0.007, + "page_width": 595, + "page_height": 841 + } + self._test_bbox_annotations(result["content"]["tables"][0]["cells"][0][0]["lines"][0], target_bbox_dict_1) self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) self.assertEqual(len(result["content"]["tables"]), 1) @@ -238,4 +283,22 @@ def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None: self.assertEqual(len(result["content"]["tables"]), 1) self.assertEqual(len(result["content"]["tables"][0]["cells"]), 5) self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"]) + target_bbox_dict_1 = { + "x_top_left": 0.13, + "y_top_left": 0.07, + "width": 0.06, + "height": 0.007, + "page_width": 595, + "page_height": 841 + } + self._test_bbox_annotations(result["content"]["tables"][0]["cells"][0][0]["lines"][0], target_bbox_dict_1) self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) + target_bbox_dict_2 = { + "x_top_left": 0.13, + "y_top_left": 0.15, + "width": 0.005, + "height": 0.007, + "page_width": 595, + "page_height": 841 + } + self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_2)