From 6cdb2cac82f371deb43a6a2aad852db3dc1936f7 Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 7 Nov 2024 16:46:18 +0300 Subject: [PATCH] Add bboxes drawing for doc with gost frame --- scripts/test_words_bbox_extraction.py | 39 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/scripts/test_words_bbox_extraction.py b/scripts/test_words_bbox_extraction.py index 37b4fc36..03e2afa1 100644 --- a/scripts/test_words_bbox_extraction.py +++ b/scripts/test_words_bbox_extraction.py @@ -9,7 +9,7 @@ from dedoc.api.dedoc_api import config from dedoc.utils.image_utils import rotate_image -from dedoc.utils.pdf_utils import get_page_image +from dedoc.utils.pdf_utils import get_page_image, get_pdf_page_count from tests.api_tests.abstract_api_test import AbstractTestApiDocReader BboxWithConfsType = namedtuple("WordWithConf", ["start", "end", "bbox", "confs", "text_type"]) @@ -61,12 +61,15 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t return text_type - def __get_words_annotation(self, structure: dict) -> List[BboxWithConfsType]: + def __get_words_annotation(self, structure: dict, page_id: int = 0) -> List[BboxWithConfsType]: stack = [structure] words_annotation = [] while len(stack) > 0: node = stack.pop() + if node["metadata"]["page_id"] != page_id: + stack.extend(node["subparagraphs"]) + continue anns_bbox = [annotation for annotation in node["annotations"] if annotation["name"] == "bounding box"] anns_conf = [annotation for annotation in node["annotations"] if annotation["name"] == "confidence"] @@ -171,6 +174,38 @@ def test_pdf_documents(self) -> None: image = self.__draw_tables_words(tables, image) cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image) + def test_gost_frame_documents(self) -> None: + filename_parameters_outputdir = [ + ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="true", need_gost_frame_analysis="true"), "gost_frame_true"], + ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="false", need_gost_frame_analysis="true"), "gost_frame_false"] + ] + + for file_name, parameters, outputdir in filename_parameters_outputdir: + output_path = os.path.join(self.output_path, outputdir) + os.makedirs(output_path, exist_ok=True) + result = self._send_request(file_name, data=parameters) + structure = result["content"]["structure"] + tables = result["content"]["tables"] + page_count = get_pdf_page_count(self._get_abs_path(file_name)) + + for page_id in range(page_count): + image = np.asarray(get_page_image(self._get_abs_path(file_name), page_id)) + word_annotations = self.__get_words_annotation(structure, page_id=page_id) + if len(word_annotations) > 0: + ann = word_annotations[0] + if ann is not None: + bbox = json.loads(ann.bbox) + image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC) + image = self.__draw_word_annotations(image, word_annotations) + if len(tables) > 0: + if len(word_annotations) == 0: + cell_line = tables[0]["cells"][0][0]["lines"][0] + ann_bbox = [annotation for annotation in cell_line["annotations"] if annotation["name"] == "bounding box"][0] + bbox = json.loads(ann_bbox["value"]) + image = cv2.resize(image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC) + image = self.__draw_tables_words(tables, image) + cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}_{page_id}.png"), image) + def test_table_word_extraction(self) -> None: output_path = os.path.join(self.output_path, "tables") os.makedirs(output_path, exist_ok=True)