From 8b6238e905ba850106d56224c4f6f56e30f63691 Mon Sep 17 00:00:00 2001
From: alexander1999-hub <golodkov.ao@phystech.edu>
Date: Thu, 7 Nov 2024 15:18:06 +0300
Subject: [PATCH 1/5] fixed bboxes

---
 dedoc/readers/pdf_reader/pdf_base_reader.py   |  2 +-
 .../pdf_txtlayer_reader.py                    | 22 +++--
 .../test_api_module_table_recognizer.py       | 88 +++++++++++++++++++
 3 files changed, 102 insertions(+), 10 deletions(-)

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
index 17816b30..101607ff 100644
--- a/dedoc/readers/pdf_reader/pdf_base_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -161,7 +161,7 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag
         page_range = range(first_page, first_page + len(gost_analyzed_images))
         gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
         if isinstance(self, PdfTxtlayerReader):
-            self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()]))
+            self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))
         result = Parallel(n_jobs=self.config["n_jobs"])(
             delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
             gost_analyzed_images.items()
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
index 46528fcd..fce751a7 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -4,7 +4,6 @@
 from numpy import ndarray
 
 from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
-from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
@@ -60,18 +59,23 @@ def _process_one_page(self,
         if page is None:
             return [], [], [], []
         if parameters.need_gost_frame_analysis:
-            page_shift = self.gost_frame_boxes[page_number]
-            self._move_table_cells(tables=tables, page_shift=page_shift, page=page)
+            page_shift = self.gost_frame_boxes[page_number][0]
+            self._move_table_cells(tables=tables, page_shift=page_shift, page=self.gost_frame_boxes[page_number][1])
+            self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
             readable_block = page_shift  # bbox representing the content of the gost frame
             page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [readable_block])]  # exclude boxes outside the frame
-        unreadable_blocks = [location.bbox for table in tables for location in table.locations]
-        page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
-        lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
-        self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
+            unreadable_blocks = [location.bbox for table in tables for location in table.locations]
+            page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
+            lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
+        else:
+            unreadable_blocks = [location.bbox for table in tables for location in table.locations]
+            page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
+            lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
+            self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
 
         return lines, tables, page.attachments, []
 
-    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: PageWithBBox) -> None:
+    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple) -> None:
         """
         Move tables back to original coordinates when parsing a document containing a gost frame
         """
@@ -81,7 +85,7 @@ def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Pag
                 location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
             for row in table.matrix_cells:
                 for cell in row:
-                    image_width, image_height = page.pdf_page_width, page.pdf_page_height
+                    image_width, image_height = page[1], page[0]
                     cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
 
     def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py
index a1e48a78..d5acda08 100644
--- a/tests/api_tests/test_api_module_table_recognizer.py
+++ b/tests/api_tests/test_api_module_table_recognizer.py
@@ -1,3 +1,4 @@
+import json
 import os
 import unittest
 from typing import List
@@ -214,22 +215,91 @@ def test_detect_small_table(self) -> None:
         tables = result["content"]["tables"]
         self.assertEqual(2, len(tables))
 
+    def _test_bbox_annotations(self, node: dict, target_dict: dict) -> None:  # check the first "bounding box" annotation of node against target_dict
+        annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "bounding box"]
+        annotations_dict = json.loads(annotations[0]["value"])  # value is a JSON string; IndexError here means node has no bbox annotation
+        for key in target_dict:  # NOTE(review): one absolute delta for every key - loose for small targets such as 0.009, confirm intent
+            self.assertAlmostEqual(float(annotations_dict[key]), target_dict[key], None, None, delta=0.05)
+
     def test_multipage_gost_table(self) -> None:
         file_name = "gost_multipage_table.pdf"
         result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"})  # don't pass pdf_with_text_layer to check condition in PDFBaseReader
         self.assertTrue(len(result["content"]["tables"][0]["cells"]) > 35)
         self.assertTrue("KR13" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"])  # check the last row of multipage table
+        target_bbox_dict_1 = {
+            "x_top_left": 0.15,
+            "y_top_left": 0.58,
+            "width": 0.04,
+            "height": 0.009,
+            "page_width": 1653,
+            "page_height": 2339
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_1)
         self.assertTrue("R13.1" in result["content"]["tables"][0]["cells"][-1][1]["lines"][0]["text"])  # check that it belongs to first and only table
+        target_bbox_dict_2 = {
+            "x_top_left": 0.25,
+            "y_top_left": 0.58,
+            "width": 0.04,
+            "height": 0.009,
+            "page_width": 1653,
+            "page_height": 2339
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][1]["lines"][0], target_bbox_dict_2)
         self.assertTrue("Испытание по проверке" in result["content"]["tables"][0]["cells"][-1][2]["lines"][0]["text"])
+        target_bbox_dict_3 = {
+            "x_top_left": 0.33,
+            "y_top_left": 0.58,
+            "width": 0.09,
+            "height": 0.009,
+            "page_width": 1653,
+            "page_height": 2339
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][2]["lines"][0], target_bbox_dict_3)
         self.assertTrue("3.6" in result["content"]["tables"][0]["cells"][-1][3]["lines"][0]["text"])
+        target_bbox_dict_4 = {
+            "x_top_left": 0.78,
+            "y_top_left": 0.58,
+            "width": 0.02,
+            "height": 0.009,
+            "page_width": 1653,
+            "page_height": 2339
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][3]["lines"][0], target_bbox_dict_4)
         self.assertTrue("7.4.9" in result["content"]["tables"][0]["cells"][-1][4]["lines"][0]["text"])
+        target_bbox_dict_5 = {
+            "x_top_left": 0.88,
+            "y_top_left": 0.58,
+            "width": 0.03,
+            "height": 0.009,
+            "page_width": 1653,
+            "page_height": 2339
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][4]["lines"][0], target_bbox_dict_5)
 
     def test_multipage_gost_table_with_text_layer(self) -> None:
         file_name = "gost_multipage_table_2.pdf"
         result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True"})
         self.assertEqual(len(result["content"]["tables"][0]["cells"]), 14)
         self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"])
+        target_bbox_dict_1 = {
+            "x_top_left": 0.13,
+            "y_top_left": 0.61,
+            "width": 0.06,
+            "height": 0.007,
+            "page_width": 595,
+            "page_height": 841
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][0][0]["lines"][0], target_bbox_dict_1)
         self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"])
+        target_bbox_dict_2 = {
+            "x_top_left": 0.13,
+            "y_top_left": 0.15,
+            "width": 0.005,
+            "height": 0.007,
+            "page_width": 595,
+            "page_height": 841
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_2)
         self.assertEqual(len(result["content"]["tables"]), 1)
 
     def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:
@@ -238,4 +308,22 @@ def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:
         self.assertEqual(len(result["content"]["tables"]), 1)
         self.assertEqual(len(result["content"]["tables"][0]["cells"]), 5)
         self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"])
+        target_bbox_dict_1 = {
+            "x_top_left": 0.13,
+            "y_top_left": 0.07,
+            "width": 0.06,
+            "height": 0.007,
+            "page_width": 595,
+            "page_height": 841
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][0][0]["lines"][0], target_bbox_dict_1)
         self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"])
+        target_bbox_dict_2 = {
+            "x_top_left": 0.13,
+            "y_top_left": 0.15,
+            "width": 0.005,
+            "height": 0.007,
+            "page_width": 595,
+            "page_height": 841
+        }
+        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_2)

From 5591d627c4bdf9da1c1109ebad392b463d9077b6 Mon Sep 17 00:00:00 2001
From: alexander1999-hub <golodkov.ao@phystech.edu>
Date: Thu, 7 Nov 2024 16:35:58 +0300
Subject: [PATCH 2/5] fixed according to pr comments

---
 .../pdf_txtlayer_reader.py                    | 17 +++--
 .../test_api_module_table_recognizer.py       | 65 ++++++-------------
 2 files changed, 28 insertions(+), 54 deletions(-)

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
index fce751a7..12f1f5e8 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -64,28 +64,27 @@ def _process_one_page(self,
             self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
             readable_block = page_shift  # bbox representing the content of the gost frame
             page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [readable_block])]  # exclude boxes outside the frame
-            unreadable_blocks = [location.bbox for table in tables for location in table.locations]
-            page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
-            lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
-        else:
-            unreadable_blocks = [location.bbox for table in tables for location in table.locations]
-            page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
-            lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
+
+        unreadable_blocks = [location.bbox for table in tables for location in table.locations]
+        page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
+        lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
+
+        if not parameters.need_gost_frame_analysis:
             self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
 
         return lines, tables, page.attachments, []
 
-    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple) -> None:
+    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple[int]) -> None:
         """
         Move tables back to original coordinates when parsing a document containing a gost frame
         """
+        image_width, image_height = page[1], page[0]
         for table in tables:
             shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left  # shift tables to original coordinates
             for location in table.locations:
                 location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
             for row in table.matrix_cells:
                 for cell in row:
-                    image_width, image_height = page[1], page[0]
                     cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
 
     def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py
index d5acda08..dbd3f10f 100644
--- a/tests/api_tests/test_api_module_table_recognizer.py
+++ b/tests/api_tests/test_api_module_table_recognizer.py
@@ -225,6 +225,16 @@ def test_multipage_gost_table(self) -> None:
         file_name = "gost_multipage_table.pdf"
         result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"})  # don't pass pdf_with_text_layer to check condition in PDFBaseReader
         self.assertTrue(len(result["content"]["tables"][0]["cells"]) > 35)
+        target_bbox_dict = {
+            "x_top_left": 0.14,
+            "y_top_left": 0.11,
+            "width": 0.07,
+            "height": 0.01,
+            "page_width": 1653,
+            "page_height": 2339
+        }
+        self._test_bbox_annotations(result["content"]["structure"]["subparagraphs"][0], target_bbox_dict)
+        self.assertTrue("Состав квалификационных испытаний" in result["content"]["structure"]["subparagraphs"][0]["text"])
         self.assertTrue("KR13" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"])  # check the last row of multipage table
         target_bbox_dict_1 = {
             "x_top_left": 0.15,
@@ -236,50 +246,24 @@ def test_multipage_gost_table(self) -> None:
         }
         self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_1)
         self.assertTrue("R13.1" in result["content"]["tables"][0]["cells"][-1][1]["lines"][0]["text"])  # check that it belongs to first and only table
-        target_bbox_dict_2 = {
-            "x_top_left": 0.25,
-            "y_top_left": 0.58,
-            "width": 0.04,
-            "height": 0.009,
-            "page_width": 1653,
-            "page_height": 2339
-        }
-        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][1]["lines"][0], target_bbox_dict_2)
         self.assertTrue("Испытание по проверке" in result["content"]["tables"][0]["cells"][-1][2]["lines"][0]["text"])
-        target_bbox_dict_3 = {
-            "x_top_left": 0.33,
-            "y_top_left": 0.58,
-            "width": 0.09,
-            "height": 0.009,
-            "page_width": 1653,
-            "page_height": 2339
-        }
-        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][2]["lines"][0], target_bbox_dict_3)
         self.assertTrue("3.6" in result["content"]["tables"][0]["cells"][-1][3]["lines"][0]["text"])
-        target_bbox_dict_4 = {
-            "x_top_left": 0.78,
-            "y_top_left": 0.58,
-            "width": 0.02,
-            "height": 0.009,
-            "page_width": 1653,
-            "page_height": 2339
-        }
-        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][3]["lines"][0], target_bbox_dict_4)
         self.assertTrue("7.4.9" in result["content"]["tables"][0]["cells"][-1][4]["lines"][0]["text"])
-        target_bbox_dict_5 = {
-            "x_top_left": 0.88,
-            "y_top_left": 0.58,
-            "width": 0.03,
-            "height": 0.009,
-            "page_width": 1653,
-            "page_height": 2339
-        }
-        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][4]["lines"][0], target_bbox_dict_5)
 
     def test_multipage_gost_table_with_text_layer(self) -> None:
         file_name = "gost_multipage_table_2.pdf"
         result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True"})
         self.assertEqual(len(result["content"]["tables"][0]["cells"]), 14)
+        target_bbox_dict = {
+            "x_top_left": 0.12,
+            "y_top_left": 0.56,
+            "width": 0.01,
+            "height": 0.01,
+            "page_width": 595,
+            "page_height": 841
+        }
+        self._test_bbox_annotations(result["content"]["structure"]["subparagraphs"][0]["subparagraphs"][0], target_bbox_dict)
+        self.assertTrue("Sample text 1" in result["content"]["structure"]["subparagraphs"][0]["subparagraphs"][0]["text"])
         self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"])
         target_bbox_dict_1 = {
             "x_top_left": 0.13,
@@ -291,15 +275,6 @@ def test_multipage_gost_table_with_text_layer(self) -> None:
         }
         self._test_bbox_annotations(result["content"]["tables"][0]["cells"][0][0]["lines"][0], target_bbox_dict_1)
         self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"])
-        target_bbox_dict_2 = {
-            "x_top_left": 0.13,
-            "y_top_left": 0.15,
-            "width": 0.005,
-            "height": 0.007,
-            "page_width": 595,
-            "page_height": 841
-        }
-        self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_2)
         self.assertEqual(len(result["content"]["tables"]), 1)
 
     def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:

From 3963f1041d660df3695c3b516dd7c3f214914682 Mon Sep 17 00:00:00 2001
From: alexander1999-hub <golodkov.ao@phystech.edu>
Date: Thu, 7 Nov 2024 16:43:40 +0300
Subject: [PATCH 3/5] fixed according to pr comments

---
 .../pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
index 12f1f5e8..49dd3309 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -74,7 +74,7 @@ def _process_one_page(self,
 
         return lines, tables, page.attachments, []
 
-    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple[int]) -> None:
+    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple[int, int]) -> None:
         """
         Move tables back to original coordinates when parsing a document containing a gost frame
         """

From 6cdb2cac82f371deb43a6a2aad852db3dc1936f7 Mon Sep 17 00:00:00 2001
From: Nasty <bogatenkova.anastasiya@mail.ru>
Date: Thu, 7 Nov 2024 16:46:18 +0300
Subject: [PATCH 4/5] Add bboxes drawing for doc with gost frame

---
 scripts/test_words_bbox_extraction.py | 39 +++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py
index 37b4fc36..03e2afa1 100644
--- a/scripts/test_words_bbox_extraction.py
+++ b/scripts/test_words_bbox_extraction.py
@@ -9,7 +9,7 @@
 
 from dedoc.api.dedoc_api import config
 from dedoc.utils.image_utils import rotate_image
-from dedoc.utils.pdf_utils import get_page_image
+from dedoc.utils.pdf_utils import get_page_image, get_pdf_page_count
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
 
 BboxWithConfsType = namedtuple("WordWithConf", ["start", "end", "bbox", "confs", "text_type"])
@@ -61,12 +61,15 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t
 
         return text_type
 
-    def __get_words_annotation(self, structure: dict) -> List[BboxWithConfsType]:
+    def __get_words_annotation(self, structure: dict, page_id: int = 0) -> List[BboxWithConfsType]:
         stack = [structure]
         words_annotation = []
 
         while len(stack) > 0:
             node = stack.pop()
+            if node["metadata"]["page_id"] != page_id:
+                stack.extend(node["subparagraphs"])
+                continue
 
             anns_bbox = [annotation for annotation in node["annotations"] if annotation["name"] == "bounding box"]
             anns_conf = [annotation for annotation in node["annotations"] if annotation["name"] == "confidence"]
@@ -171,6 +174,38 @@ def test_pdf_documents(self) -> None:
                 image = self.__draw_tables_words(tables, image)
             cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)
 
+    def test_gost_frame_documents(self) -> None:  # save every page of the listed pdfs as png with the returned bboxes drawn on top
+        filename_parameters_outputdir = [  # [file name, request parameters, output subdirectory]
+            ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="true", need_gost_frame_analysis="true"), "gost_frame_true"],
+            ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="false", need_gost_frame_analysis="true"), "gost_frame_false"]
+        ]
+
+        for file_name, parameters, outputdir in filename_parameters_outputdir:
+            output_path = os.path.join(self.output_path, outputdir)
+            os.makedirs(output_path, exist_ok=True)
+            result = self._send_request(file_name, data=parameters)
+            structure = result["content"]["structure"]
+            tables = result["content"]["tables"]
+            page_count = get_pdf_page_count(self._get_abs_path(file_name))
+
+            for page_id in range(page_count):
+                image = np.asarray(get_page_image(self._get_abs_path(file_name), page_id))
+                word_annotations = self.__get_words_annotation(structure, page_id=page_id)  # nodes with another metadata page_id are skipped
+                if len(word_annotations) > 0:
+                    ann = word_annotations[0]
+                    if ann is not None:
+                        bbox = json.loads(ann.bbox)  # page size is read from the first annotation's bbox
+                        image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
+                        image = self.__draw_word_annotations(image, word_annotations)
+                if len(tables) > 0:
+                    if len(word_annotations) == 0:  # image was not resized above - take the page size from the first table cell line
+                        cell_line = tables[0]["cells"][0][0]["lines"][0]
+                        ann_bbox = [annotation for annotation in cell_line["annotations"] if annotation["name"] == "bounding box"][0]
+                        bbox = json.loads(ann_bbox["value"])
+                        image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)
+                    image = self.__draw_tables_words(tables, image)  # NOTE(review): all tables are passed for every page - confirm the helper filters by page
+                cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}_{page_id}.png"), image)  # one png per page
+
     def test_table_word_extraction(self) -> None:
         output_path = os.path.join(self.output_path, "tables")
         os.makedirs(output_path, exist_ok=True)

From 54f5c64b1a994e16342afd5aec1038165ce404f2 Mon Sep 17 00:00:00 2001
From: alexander1999-hub <golodkov.ao@phystech.edu>
Date: Thu, 7 Nov 2024 18:16:46 +0300
Subject: [PATCH 5/5] fixed according to pr comments

---
 .../pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
index 49dd3309..d277815b 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -78,7 +78,7 @@ def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tup
         """
         Move tables back to original coordinates when parsing a document containing a gost frame
         """
-        image_width, image_height = page[1], page[0]
+        image_height, image_width = page
         for table in tables:
             shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left  # shift tables to original coordinates
             for location in table.locations: