diff --git a/docker/Dockerfile b/Dockerfile similarity index 79% rename from docker/Dockerfile rename to Dockerfile index 43e4be14..a191bd42 100644 --- a/docker/Dockerfile +++ b/Dockerfile @@ -1,11 +1,11 @@ ARG REPOSITORY="docker.io" -FROM dedocproject/baseimg +FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root" ENV RESOURCES_PATH "/dedoc_root/resources" ADD requirements.txt . -RUN pip3 install -r requirements.txt +RUN pip3 install --no-cache-dir -r requirements.txt RUN mkdir /dedoc_root ADD dedoc /dedoc_root/dedoc @@ -17,4 +17,4 @@ RUN python3 /dedoc_root/dedoc/download_models.py ADD tests /dedoc_root/tests ADD resources /dedoc_root/resources -CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"] \ No newline at end of file +CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"] diff --git a/VERSION b/VERSION index 142464bf..027934ea 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.0 \ No newline at end of file +0.11.1 \ No newline at end of file diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index bd765535..a772c95b 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -219,3 +219,9 @@ def __table2html(table: Table, table2id: Dict[str, int]) -> str: text += "\n" text += "\n" return text + + +def json2txt(paragraph: TreeNode) -> str: + subparagraphs_text = "\n".join([json2txt(subparagraph) for subparagraph in paragraph.subparagraphs]) + text = f"{paragraph.text}\n{subparagraphs_text}" + return text diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index 370a3cc1..7e295c7a 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -10,7 +10,7 @@ import dedoc from dedoc.api.api_args import QueryParameters -from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree +from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt from dedoc.common.exceptions.dedoc_error import DedocError from dedoc.common.exceptions.missing_file_error import MissingFileError from dedoc.config import get_config @@ -76,6 +76,9 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D if return_format == "html": html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0) return HTMLResponse(content=html_content, status_code=200) + elif return_format == "plain_text": + txt_content = json2txt(paragraph=document_tree.content.structure) + return PlainTextResponse(content=txt_content, status_code=200) elif return_format == "tree": html_content = json2tree(paragraph=document_tree.content.structure) return HTMLResponse(content=html_content, status_code=200) diff --git a/dedoc/api/static/html_eng/form_input.html b/dedoc/api/static/html_eng/form_input.html index cdb39042..e8ab3081 100644 --- a/dedoc/api/static/html_eng/form_input.html +++ b/dedoc/api/static/html_eng/form_input.html @@ -32,6 +32,7 @@

Structure Document Recognition

+ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 6bf31bd2..e8438f03 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -3,6 +3,7 @@ import math import os import subprocess +from collections import namedtuple from typing import List, Optional, Tuple import numpy as np @@ -10,6 +11,7 @@ from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation @@ -33,6 +35,8 @@ from dedoc.utils.parameter_utils import get_param_page_slice from dedoc.utils.utils import calculate_file_hash +CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible") + class PdfTabbyReader(PdfBaseReader): """ @@ -76,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - lines, scan_tables = self.__extract(path=path) + lines, scan_tables, tables_cell_properties = self.__extract(path=path) warnings = [] document_metadata = None @@ -93,10 +97,12 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[]) tables = [] - for scan_table in scan_tables: + assert len(scan_tables) == len(tables_cell_properties) + for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties): + cell_properties = [[cellp for cellp in row] for row in table_cells_property] metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name) cells = [[cell for cell in row] for row in scan_table.matrix_cells] - table = Table(metadata=metadata, cells=cells) + table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties) tables.append(table) attachments = [] @@ -111,23 +117,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio return self._postprocess(result) - def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable]]: + def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]: file_hash = calculate_file_hash(path=path) document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page) all_lines = [] all_tables = [] + all_cell_properties = [] for page in document.get("pages", []): lines = self.__get_lines_with_location(page, file_hash) if lines: all_lines.extend(lines) - tables = self.__get_tables(page, file_hash) + tables, cell_properties = self.__get_tables(page, file_hash) if tables: all_tables.extend(tables) + all_cell_properties.extend(cell_properties) - return all_lines, all_tables + return all_lines, all_tables, all_cell_properties def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]: tables = [] + cell_properties = [] page_number = page["number"] i = 0 for table in page["tables"]: @@ -138,26 +147,44 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]: y_bottom_right = y_top_left + table["height"] order = table["order"] rows = table["rows"] + cell_properties_json = table["cell_properties"] + cell_property_list = [] + + for cell_properties_row in cell_properties_json: + cell_property_row_list = [] + + for cell_property in cell_properties_row: + cell_property_info = CellPropertyInfo(cell_property["col_span"], + cell_property["row_span"], + bool(cell_property["invisible"])) + + cell_property_row_list.append(cell_property_info) + + cell_property_list.append(cell_property_row_list) + cells = [row for row in rows] bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order)) + cell_properties.append(cell_property_list) - return tables + return tables, cell_properties def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]: lines = [] page_number = page["number"] + page_width = int(page["width"]) + page_height = int(page["height"]) prev_line = None for block in page["blocks"]: annotations = [] order = block["order"] block_text = block["text"] - bx_top_left = block["x_top_left"] - by_top_left = block["y_top_left"] - bx_bottom_right = bx_top_left + block["width"] - by_bottom_right = by_top_left + block["height"] + bx_top_left = int(block["x_top_left"]) + by_top_left = int(block["y_top_left"]) + bx_bottom_right = bx_top_left + int(block["width"]) + by_bottom_right = by_top_left + int(block["height"]) indent = block["indent"] spacing = block["spacing"] len_block = len(block_text) @@ -173,7 +200,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith url = annotation["url"] start = annotation["start"] end = annotation["end"] - + x_top_left = int(annotation["x_top_left"]) + y_top_left = int(annotation["y_top_left"]) + x_bottom_right = bx_top_left + int(annotation["width"]) + y_bottom_right = by_top_left + int(annotation["height"]) + box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) + annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height)) annotations.append(SizeAnnotation(start, end, str(font_size))) annotations.append(StyleAnnotation(start, end, font_name)) @@ -189,6 +221,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith meta = block["metadata"].lower() uid = f"txt_{file_hash}_{order}" bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right)) + annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height)) metadata = LineMetadata(page_id=page_number, line_id=order) line_with_location = LineWithLocation(line=block_text, diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 21bf6943..0f17ce23 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -8,7 +8,7 @@ from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox @@ -25,7 +25,7 @@ def __init__(self, *, config: dict) -> None: :param config: configuration of the reader, e.g. logger for logging """ super().__init__(config=config) - self.extractor_layer = ExtractorPdfTextLayer(config=config) + self.extractor_layer = PdfminerExtractor(config=config) def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/__init__.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py similarity index 65% rename from dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py rename to dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index 5f742f0a..d91d8439 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -2,23 +2,23 @@ import itertools import logging import os -import re import uuid from collections import namedtuple -from typing import IO, List, Match, Optional, Tuple +from typing import List, Optional, Tuple import cv2 import numpy as np from PIL import Image from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTAnno, LTChar, LTContainer, LTCurve, LTFigure, LTImage, LTRect, LTTextBox, LTTextBoxHorizontal, LTTextLineHorizontal +from pdfminer.layout import LAParams, LTAnno, LTChar, LTContainer, LTCurve, LTFigure, LTImage, LTRect +from pdfminer.layout import LTTextBox, LTTextBoxHorizontal, LTTextContainer, LTTextLineHorizontal from pdfminer.pdfinterp import PDFPageInterpreter from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfpage import PDFPage from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.annotation import Annotation -from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation @@ -27,13 +27,14 @@ from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.location import Location from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_utils import cleaning_text_from_hieroglyphics, create_bbox, draw_annotation from dedoc.utils.pdf_utils import get_page_image -StyleLine = namedtuple("StyleLine", ["begin", "end", "bold", "italic", "font_size", "font_style", "table_name"]) logging.getLogger("pdfminer").setLevel(logging.ERROR) +WordObj = namedtuple("Word", ["start", "end", "value"]) -class ExtractorPdfTextLayer(object): +class PdfminerExtractor(object): """ Class extarcts text with style from pdf with help pdfminer.six """ @@ -68,15 +69,19 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_colum image_page = self.__get_image(path=path, page_num=page_number) image_height, image_width, *_ = image_page.shape - height = page.mediabox[3] - width = page.mediabox[2] + height = int(page.mediabox[3]) + width = int(page.mediabox[2]) if height > 0 and width > 0: - k_w, k_h = image_width / width, image_height / height + k_w, k_h = image_width / page.mediabox[2], image_height / page.mediabox[3] page_broken = False else: page_broken = True k_w, k_h = None, None - # 1. extract only textline object + + if self.config.get("debug_mode", False): + self.__debug_extract_layout(image_page, layout, page_number, k_w, k_h, page, width, height) + + # 1. extract textline objects and image (as LTImage) images = [] layout_objects = [lobj for lobj in layout] lobjs_textline = [] @@ -87,19 +92,20 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_colum lobjs_textline.extend(lines) elif isinstance(lobj, LTTextLineHorizontal): lobjs_textline.append(lobj) + elif isinstance(lobj, LTFigure) and not page_broken: attachment = self.__extract_image(directory, height, image_page, k_h, k_w, lobj, page_number) - if attachment is not None: images.append(attachment) bboxes = [] for line_num, lobj in enumerate(lobjs_textline): - bbox = self.get_info_layout_object(lobj, page_num=page_number, line_num=line_num, k_w=k_w, k_h=k_h, height=height) + text_with_bbox = self.get_info_layout_object(lobj, page_num=page_number, line_num=line_num, k_w=k_w, k_h=k_h, height=height, width=width) + if text_with_bbox.bbox.width * text_with_bbox.bbox.height > 0: + bboxes.append(text_with_bbox) - if bbox.bbox.width * bbox.bbox.height > 0: - bboxes.append(bbox) attachments = images if len(images) < 10 else [] + return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments) def __extract_image(self, @@ -111,14 +117,13 @@ def __extract_image(self, lobj: LTContainer, page_number: int) -> Optional[PdfImageAttachment]: try: - bbox = self._create_bbox(k_h=k_h, k_w=k_w, height=height, lobj=lobj) + bbox = create_bbox(k_h=k_h, k_w=k_w, height=height, lobj=lobj) location = Location(bbox=bbox, page_number=page_number) cropped = image_page[bbox.y_top_left: bbox.y_bottom_right, bbox.x_top_left: bbox.x_bottom_right] uid = f"fig_{uuid.uuid1()}" file_name = f"{uid}.png" path_out = os.path.join(directory, file_name) Image.fromarray(cropped).save(path_out) - image_page[bbox.y_top_left: bbox.y_bottom_right, bbox.x_top_left: bbox.x_bottom_right] = 255 attachment = PdfImageAttachment(original_name=file_name, tmp_file_path=path_out, need_content_analysis=False, uid=uid, location=location) except Exception as ex: self.logger.error(ex) @@ -144,175 +149,157 @@ def __get_interpreter(self, is_one_column_document: bool) -> Tuple[PDFPageAggreg interpreter = PDFPageInterpreter(rsrcmgr, device) return device, interpreter - def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, page_num: int, k_w: float, k_h: float, page: PDFPage) -> None: - """ - Function for debugging of pdfminer.six layout - :param layout: container of layout element - :return: None - """ - tmp_dir = os.path.join(self.config["path_debug"], "pdfminer") - if not os.path.exists(tmp_dir): - os.mkdir(tmp_dir) - - file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt") - - # 1. extract layout objects - lobjs = [lobj for lobj in layout] - lobjs_textline = [] - lobjs_box = [] - lobjs_words = [] - lobjs_figures = [] - lobjs_images = [] - lobjs_curves = [] - - for lobj in lobjs: - if isinstance(lobj, LTTextBoxHorizontal): - lobjs_textline.extend(lobj) - elif isinstance(lobj, LTTextLineHorizontal): - lobjs_textline.append(lobj) - elif isinstance(lobj, LTRect): - lobjs_box.append(lobj) - elif isinstance(lobj, LTFigure): - lobjs_figures.append(lobj) - elif isinstance(lobj, LTImage): - lobjs_images.append(lobj) - elif isinstance(lobj, LTCurve): - lobjs_curves.append(lobj) - elif isinstance(lobj, LTTextBox): - lobjs_words.append(lobj) - - # 3. print information - self.__draw_layout_element(image_src, lobjs_textline, file_text, k_w, k_h, page, (0, 255, 0)) - self.__draw_layout_element(image_src, lobjs_words, file_text, k_w, k_h, page, (0, 255, 0)) - self.__draw_layout_element(image_src, lobjs_box, file_text, k_w, k_h, page, (0, 0, 255), text="LTRect") - self.__draw_layout_element(image_src, lobjs_figures, file_text, k_w, k_h, page, (255, 0, 0), text="LTFigure") - self.__draw_layout_element(image_src, lobjs_images, file_text, k_w, k_h, page, (0, 255, 255), text="LTImage") - self.__draw_layout_element(image_src, lobjs_curves, file_text, k_w, k_h, page, (0, 255, 255), text="LTCurve") - - cv2.imwrite(os.path.join(tmp_dir, f"img_page_{page_num}.png"), image_src) - file_text.close() - - def __draw_layout_element(self, - image_src: np.ndarray, - lobjs: List, - file: IO, - k_w: float, - k_h: float, - page: PDFPage, - color: Tuple[int, int, int], - text: Optional[str] = None) -> None: - for lobj in lobjs: - # converting coordinate from pdf format into image - box_lobj = ExtractorPdfTextLayer.convert_coordinates_pdf_to_image(lobj, k_w, k_h, page.mediabox[3]) - - cv2.rectangle(image_src, (box_lobj.x_top_left, box_lobj.y_top_left), (box_lobj.x_bottom_right, box_lobj.y_bottom_right), color) - - if text is not None: - cv2.putText(image_src, text, (box_lobj.x_top_left, box_lobj.y_top_left), cv2.FONT_HERSHEY_SIMPLEX, 1, color) - else: - file.write(lobj.get_text()) - - @staticmethod - def convert_coordinates_pdf_to_image(lobj: LTContainer, k_w: float, k_h: float, height_page: int) -> BBox: - x0_new = int(lobj.x0 * k_w) - x1_new = int(lobj.x1 * k_w) - y0_new = int((height_page - lobj.y1) * k_h) - y1_new = int((height_page - lobj.y0) * k_h) - - return BBox(x0_new, y0_new, x1_new - x0_new, y1_new - y0_new) - - def get_info_layout_object(self, lobj: LTContainer, page_num: int, line_num: int, k_w: float, k_h: float, height: int) -> TextWithBBox: + def get_info_layout_object(self, + lobj: LTContainer, + page_num: int, + line_num: int, + k_w: float, + k_h: float, + height: int, + width: int) -> TextWithBBox: # 1 - converting coordinate from pdf format into image - bbox = self._create_bbox(height, k_h, k_w, lobj) + bbox = create_bbox(height, k_h, k_w, lobj) # 2 - extract text and text annotations from current object - text, text_anns = self._get_style_and_text_from_layout_object(lobj) - return TextWithBBox(bbox=bbox, page_num=page_num, text=text, line_num=line_num, annotations=text_anns) - - def _create_bbox(self, height: int, k_h: float, k_w: float, lobj: LTContainer) -> BBox: - curr_box_line = ExtractorPdfTextLayer.convert_coordinates_pdf_to_image(lobj, k_w, k_h, height) - bbox = BBox.from_two_points((curr_box_line.x_top_left, curr_box_line.y_top_left), (curr_box_line.x_bottom_right, curr_box_line.y_bottom_right)) - return bbox - - def _get_style_and_text_from_layout_object(self, lobj: LTContainer) -> [str, List[Annotation]]: - + text = "" + annotations = [] if isinstance(lobj, LTTextLineHorizontal): # cleaning text from (cid: *) - text = self._cleaning_text_from_hieroglyphics(lobj.get_text()) - # get line's style - anns = self._get_line_style(lobj) + text = cleaning_text_from_hieroglyphics(lobj.get_text()) + # get line's annotations + annotations = self.__get_line_annotations(lobj, k_w, k_h, height, width) - return text, anns - else: - return "", None + return TextWithBBox(bbox=bbox, page_num=page_num, text=text, line_num=line_num, annotations=annotations) - def _get_line_style(self, lobj: LTTextLineHorizontal) -> List[Annotation]: - # 1 - prepare data for groupby name + def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]: + # 1 - prepare data for group by name chars_with_style = [] rand_weight = self._get_new_weight() prev_style = "" + for lobj_char in lobj: if isinstance(lobj_char, LTChar) or isinstance(lobj_char, LTAnno): + # get styles if len(chars_with_style) > 0: # check next char different from previously then we fresh rand_weight prev_style, prev_size = chars_with_style[-1].split("_rand_") - if isinstance(lobj_char, LTChar): + + if isinstance(lobj_char, LTChar) and lobj_char.get_text() not in (" ", "\n", "\t"): curr_style = f"{lobj_char.fontname}_{round(lobj_char.size, 0)}" if curr_style != prev_style: rand_weight = self._get_new_weight() chars_with_style.append(f"{curr_style}_rand_{rand_weight}") - elif isinstance(lobj_char, LTAnno) and lobj_char.get_text() in (" ", "\n") and len(chars_with_style) > 0: - # check on the space or \n (in pdfminer is type LTAnno) + elif lobj_char.get_text() in (" ", "\n", "\t") and len(chars_with_style) > 0: + # check on the space or \n # duplicated previous style chars_with_style.append(chars_with_style[-1]) - styles = [] - - # 2 - extract diapasons from the style char array (chars_with_style) - pointer_into_string = 0 + annotations = self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width) + # 3 - extract range from chars_with_style array + char_pointer = 0 for key, group in itertools.groupby(chars_with_style, lambda x: x): count_chars = len(list(group)) - styles.extend(self.__parse_style_string(key, pointer_into_string, pointer_into_string + count_chars - 1)) - pointer_into_string += count_chars - - return styles + annotations.extend(self.__parse_style_string(key, char_pointer, char_pointer + count_chars - 1)) + char_pointer += count_chars - def _cleaning_text_from_hieroglyphics(self, text_str: str) -> str: - """ - replace all cid-codecs into ascii symbols. cid-encoding - hieroglyphic fonts - :param text_str: text - :return: text wo cids-chars - """ - return re.sub(r"\(cid:(\d)*\)", self.cid_recognized, text_str) + return annotations - def cid_recognized(self, m: Match) -> str: - v = m.group(0) - v = v.strip("(") - v = v.strip(")") - ascii_num = v.split(":")[-1] - ascii_num = int(ascii_num) - text_val = chr(ascii_num) - - return text_val + def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]: + words: List[WordObj] = [] + word: WordObj = WordObj(start=0, end=0, value=LTTextContainer()) + if isinstance(lobj, LTTextLineHorizontal): + lobj = [lobj] + + for text_line in lobj: + for item, lobj_char in enumerate(text_line): + if isinstance(lobj_char, LTChar) and lobj_char.get_text() not in (" ", "\n", "\t"): + word = word._replace(end=word.end + 1) + word.value.add(lobj_char) + elif lobj_char.get_text() in (" ", "\n", "\t"): + if word.value._objs: + words.append(word) + word = WordObj(start=item + 1, end=item + 1, value=LTTextContainer()) + + annotations = [BBoxAnnotation(start=word.start, + end=word.end, + value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value), + page_width=width, + page_height=height) for word in words] + return annotations def _get_new_weight(self) -> str: return binascii.hexlify(os.urandom(8)).decode("ascii") def __parse_style_string(self, chars_with_meta: str, begin: int, end: int) -> List[Annotation]: # style parsing - line_anns = [] + annotations = [] prev_style, _ = chars_with_meta.split("_rand_") font, size, *_ = prev_style.split("_") fontname_wo_rand = font.split("+")[-1] styles = fontname_wo_rand.split("-")[-1] + annotations.append(StyleAnnotation(begin, end, value=fontname_wo_rand)) + if "Bold" in styles: - line_anns.append(BoldAnnotation(begin, end, value="True")) + annotations.append(BoldAnnotation(begin, end, value="True")) if "Italic" in styles: - line_anns.append(ItalicAnnotation(begin, end, value="True")) - line_anns.append(StyleAnnotation(begin, end, value=fontname_wo_rand)) + annotations.append(ItalicAnnotation(begin, end, value="True")) + if size.replace(".", "", 1).isnumeric(): - line_anns.append(SizeAnnotation(begin, end, value=size)) + annotations.append(SizeAnnotation(begin, end, value=size)) + + return annotations + + def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, page_num: int, k_w: float, k_h: float, page: PDFPage, + width: int, height: int) -> None: + """ + Function for debugging of pdfminer.six layout + :param layout: container of layout element + :return: None + """ + tmp_dir = os.path.join(self.config.get("path_debug"), "pdfminer") + os.makedirs(tmp_dir, exist_ok=True) + + file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt") + + # 1. extract layout objects + lobjs = [lobj for lobj in layout] + lobjs_textline = [] + lobjs_box = [] + lobjs_words = [] + lobjs_figures = [] + lobjs_images = [] + lobjs_curves = [] + annotations = [] - return line_anns + for lobj in lobjs: + if isinstance(lobj, LTTextBoxHorizontal): + annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)) + lobjs_textline.extend(lobj) + elif isinstance(lobj, LTTextLineHorizontal): + annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)) + lobjs_textline.append(lobj) + elif isinstance(lobj, LTRect): + lobjs_box.append(lobj) + elif isinstance(lobj, LTFigure): + lobjs_figures.append(lobj) + elif isinstance(lobj, LTImage): + lobjs_images.append(lobj) + elif isinstance(lobj, LTCurve): + lobjs_curves.append(lobj) + elif isinstance(lobj, LTTextBox): + lobjs_words.append(lobj) + # 3. print information + draw_annotation(image_src, annotations) + """ + Call for debugging other LT elements: + self.__draw_layout_element(image_src, lobjs_textline, file_text, k_w, k_h, page, (0, 255, 0)) + self.__draw_layout_element(image_src, lobjs_words, file_text, k_w, k_h, page, (0, 255, 0)) + self.__draw_layout_element(image_src, lobjs_box, file_text, k_w, k_h, page, (0, 0, 255), text="LTRect") + self.__draw_layout_element(image_src, lobjs_figures, file_text, k_w, k_h, page, (255, 0, 0), text="LTFigure") + self.__draw_layout_element(image_src, lobjs_images, file_text, k_w, k_h, page, (0, 255, 255), text="LTImage") + self.__draw_layout_element(image_src, lobjs_curves, file_text, k_w, k_h, page, (0, 255, 255), text="LTCurve")''' + """ + cv2.imwrite(os.path.join(tmp_dir, f"img_page_{page_num}.png"), image_src) + file_text.close() diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py new file mode 100644 index 00000000..cc10c1af --- /dev/null +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py @@ -0,0 +1,73 @@ +import json +import re +from typing import IO, List, Match, Optional, Tuple + +import cv2 +import numpy as np +from pdfminer.layout import LTContainer +from pdfminer.pdfpage import PDFPage + +from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation + + +def draw_layout_element(image_src: np.ndarray, + lobjs: List, + file: IO, + k_w: float, + k_h: float, + page: PDFPage, + color: Tuple[int, int, int], + text: Optional[str] = None) -> None: + for lobj in lobjs: + # converting coordinate from pdf format into image + box_lobj = convert_coordinates_pdf_to_image(lobj, k_w, k_h, int(page.mediabox[3])) + + cv2.rectangle(image_src, (box_lobj.x_top_left, box_lobj.y_top_left), (box_lobj.x_bottom_right, box_lobj.y_bottom_right), color) + + if text is not None: + cv2.putText(image_src, text, (box_lobj.x_top_left, box_lobj.y_top_left), cv2.FONT_HERSHEY_SIMPLEX, 1, color) + else: + file.write(lobj.get_text()) + + +def draw_annotation(image: np.ndarray, annotations: List[BBoxAnnotation]) -> None: + for ann in annotations: + bbox = json.loads(ann.value) + p1 = (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"])) + p2 = (int((bbox["x_top_left"] + bbox["width"]) * bbox["page_width"]), int((bbox["y_top_left"] + bbox["height"]) * bbox["page_height"])) + cv2.rectangle(image, p1, p2, (0, 255, 0)) + + +def convert_coordinates_pdf_to_image(lobj: LTContainer, k_w: float, k_h: float, height_page: int) -> BBox: + x0 = int(lobj.x0 * k_w) + x1 = int(lobj.x1 * k_w) + y0 = int((height_page - lobj.y1) * k_h) + y1 = int((height_page - lobj.y0) * k_h) + + return BBox(x0, y0, x1 - x0, y1 - y0) + + +def create_bbox(height: int, k_h: float, k_w: float, lobj: LTContainer) -> BBox: + curr_box_line = convert_coordinates_pdf_to_image(lobj, k_w, k_h, height) + bbox = BBox.from_two_points((curr_box_line.x_top_left, curr_box_line.y_top_left), (curr_box_line.x_bottom_right, curr_box_line.y_bottom_right)) + return bbox + + +def cleaning_text_from_hieroglyphics(text_str: str) -> str: + """ + replace all cid-codecs into ascii symbols. cid-encoding - hieroglyphic fonts + :param text_str: text + :return: text wo cids-chars + """ + return re.sub(r"\(cid:(\d)*\)", cid_to_ascii_text, text_str) + + +def cid_to_ascii_text(m: Match) -> str: + v = m.group(0) + v = v.strip("(").strip(")") + ascii_num = v.split(":")[-1] + ascii_num = int(ascii_num) + text_value = chr(ascii_num) + + return text_value diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar index 7899e232..b3d5eae8 100644 Binary files a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar differ diff --git a/dedoc/scripts/benchmark_tl_correctness.py b/dedoc/scripts/benchmark_tl_correctness.py index b51fdc6e..ae379c78 100644 --- a/dedoc/scripts/benchmark_tl_correctness.py +++ b/dedoc/scripts/benchmark_tl_correctness.py @@ -5,7 +5,7 @@ import requests import wget -from config import get_config +from dedoc.config import get_config from tqdm import tqdm from dedoc.utils.utils import send_file diff --git a/dedoc/scripts/calc_tesseract_benchmarks.py b/dedoc/scripts/calc_tesseract_benchmarks.py index 0db13299..69f569c7 100644 --- a/dedoc/scripts/calc_tesseract_benchmarks.py +++ b/dedoc/scripts/calc_tesseract_benchmarks.py @@ -1,7 +1,5 @@ -import argparse import os import re -import shutil import zipfile from tempfile import TemporaryDirectory from typing import Dict, List @@ -9,17 +7,15 @@ import cv2 import numpy as np import pytesseract +import wget from texttable import Texttable -parser = argparse.ArgumentParser() -parser.add_argument("--input_path", "-i", type=str, default="../../resources/benchmarks/data_tesseract_benchmarks.zip") -parser.add_argument("--output_path", "-o", type=str, default="../../resources/benchmarks/") -parser.add_argument("--log_path", "-l", type=str, default="/tmp/dedoc/benchamarks/tesseract/") +from dedoc.config import get_config def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str: - config = "--psm {}".format(psm) - text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)['text'] + config = f"--psm {psm}" + text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)["text"] return text @@ -53,20 +49,14 @@ def _update_statistics_by_dataset(statistics: Dict, dataset: str, accuracy_path: matched = [line for line in lines if "Accuracy After Correction" in line] if not matched: matched = [line for line in lines if "Accuracy\n" in line] - acc_percent = re.findall(r'\d+\.\d+', matched[0])[0][:-1] + acc_percent = re.findall(r"\d+\.\d+", matched[0])[0][:-1] statistic["Accuracy"].append(float(acc_percent)) statistic["Amount of words"].append(word_cnt) - statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], - "ASCII Spacing Characters", - lines) - statistic["ASCII_Special_Symbols"] = _update_statistics_by_symbol_kind(statistic["ASCII_Special_Symbols"], - "ASCII Special Symbols", - lines) + statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], "ASCII Spacing Characters", lines) + statistic["ASCII_Special_Symbols"] = _update_statistics_by_symbol_kind(statistic["ASCII_Special_Symbols"], "ASCII Special Symbols", lines) statistic["ASCII_Digits"] = _update_statistics_by_symbol_kind(statistic["ASCII_Digits"], "ASCII Digits", lines) - statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], - "ASCII Spacing Characters", - lines) + statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], "ASCII Spacing Characters", lines) statistic["Cyrillic"] = _update_statistics_by_symbol_kind(statistic["Cyrillic"], "Cyrillic", lines) statistics[dataset] = statistic @@ -90,21 +80,27 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: if __name__ == "__main__": - args = parser.parse_args() + base_zip = "data_tesseract_benchmarks" + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) + cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") + os.makedirs(cache_dir, exist_ok=True) + benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") + + if not os.path.isfile(benchmark_data_path): + wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path) + print(f"Benchmark data downloaded to {benchmark_data_path}") + else: + print(f"Use cached benchmark data from {benchmark_data_path}") + assert os.path.isfile(benchmark_data_path) + accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] accs_common = [["Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", "Amount of words", "AVG Accuracy"]] - base_zip = "data_tesseract_benchmarks" - statistics = {} - if os.path.exists(args.log_path): - shutil.rmtree(args.log_path) - os.makedirs(args.log_path) - - with zipfile.ZipFile(args.input_path, 'r') as arch_file: + with zipfile.ZipFile(benchmark_data_path, "r") as arch_file: names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0] - abs_paths_to_files = [name.split('/')[:] for name in names_dirs] + abs_paths_to_files = [name.split("/")[:] for name in names_dirs] datasets = set([paths[1] for paths in abs_paths_to_files]) @@ -114,21 +110,19 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: for img_name in sorted(imgs): base_name, ext = os.path.splitext(img_name) - if ext not in ['.txt', '.png', '.tiff', '.tif', '.jpg']: + if ext not in [".txt", ".png", ".tiff", ".tif", ".jpg"]: continue - gt_path = os.path.join(base_zip, dataset_name, "gts", base_name + ".txt") + gt_path = os.path.join(base_zip, dataset_name, "gts", f"{base_name}.txt") imgs_path = os.path.join(base_zip, dataset_name, "imgs", img_name) - accuracy_path = os.path.join(args.log_path, dataset_name + "_" + base_name + "_accuracy.txt") + accuracy_path = os.path.join(cache_dir, f"{dataset_name}_{base_name}_accuracy.txt") with TemporaryDirectory() as tmpdir: tmp_gt_path = os.path.join(tmpdir, "tmp_gt.txt") tmp_ocr_path = os.path.join(tmpdir, "tmp_ocr.txt") try: - with arch_file.open(gt_path) as gt_file, \ - open(tmp_gt_path, "wb") as tmp_gt_file,\ - open(tmp_ocr_path, "w") as tmp_ocr_file: + with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(tmp_ocr_path, "w") as tmp_ocr_file: gt_text = gt_file.read().decode("utf-8") word_cnt = len(gt_text.split()) @@ -146,7 +140,8 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: tmp_ocr_file.flush() # calculation accuracy build for Ubuntu from source https://github.com/eddieantonio/ocreval - command = "accuracy {} {} >> {}".format(tmp_gt_path, tmp_ocr_path, accuracy_path) + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy")) + command = f"{accuracy_script_path} {tmp_gt_path} {tmp_ocr_path} >> {accuracy_path}" os.system(command) statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) @@ -154,6 +149,7 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: except Exception as ex: print(ex) + print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`") table_aacuracy_per_image = Texttable() table_aacuracy_per_image.add_rows(accs) @@ -167,13 +163,12 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: accs_common.append(row) table_common.add_rows(accs_common) - with open(os.path.join(args.output_path, "tesseract.benchmark"), "w") as res_file: - res_file.write( - "Tesseract version is {}\nTable 1 - Accuracy for each file\n".format(pytesseract.get_tesseract_version())) + with open(os.path.join(output_dir, "tesseract_benchmark.txt"), "w") as res_file: + res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\nTable 1 - Accuracy for each file\n") res_file.write(table_aacuracy_per_image.draw()) - res_file.write("\n\nTable 2 - AVG by each type of symbols:\n") + res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n") res_file.write(table_common.draw()) - print("Tesseract version is {}".format(pytesseract.get_tesseract_version())) + print(f"Tesseract version is {pytesseract.get_tesseract_version()}") print(table_aacuracy_per_image.draw()) print(table_common.draw()) diff --git a/docker-compose.yml b/docker-compose.yml index ec6bab7c..3cfe4b62 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ services: mem_limit: 16G build: context: . - dockerfile: docker/Dockerfile + dockerfile: Dockerfile restart: always tty: true ports: @@ -19,7 +19,7 @@ services: - dedoc build: context: . - dockerfile: docker/Dockerfile + dockerfile: Dockerfile tty: true environment: DOC_READER_HOST: "dedoc" diff --git a/docker/DockerfileBaseimg b/docker/DockerfileBaseimg deleted file mode 100644 index 3f05f37f..00000000 --- a/docker/DockerfileBaseimg +++ /dev/null @@ -1,59 +0,0 @@ -ARG REPOSITORY="docker.io" -FROM ubuntu:bionic-20210118 - - -RUN apt-get update && apt-get install -y software-properties-common locales && locale-gen en_US.UTF-8 -RUN locale-gen ru_RU.UTF-8 -ENV TZ=Europe/Moscow -RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -ENV LANG ru_RU.utf8 -ENV LANGUAGE ru_RU:ru -ENV LC_ALL ru_RU.UTF-8 - -# --------------------------------------------------PYTHON INSTALLATION------------------------------------------------- -RUN apt-get update && \ - apt-get -y install curl git unzip wget build-essential gcc-multilib g++-multilib git clang zlib1g-dev \ - pkg-config libglib2.0-dev python3 python3-pip libtool binutils-dev -RUN curl https://repo.anaconda.com/archive/Anaconda3-2022.10-Linux-x86_64.sh --output anaconda.sh -RUN bash anaconda.sh -b -p /anaconda3 -ENV PATH=/anaconda3/bin:$PATH -RUN conda init bash -RUN bash - -RUN apt-get install -y libreoffice - -# -----------------------------------------------TESSERACT INSTALLATION------------------------------------------------- -# the commands below are used to install tesseract - -RUN add-apt-repository -y ppa:alex-p/tesseract-ocr-devel \ - && apt update --allow-releaseinfo-change \ - && apt-get install -y djvulibre-bin unrtf poppler-utils pstotext tesseract-ocr libjpeg-dev swig \ - libtesseract-dev libleptonica-dev unrar python-poppler automake ca-certificates g++ libtool libleptonica-dev \ - make pkg-config libpango1.0-dev - -RUN git clone --depth 1 --branch 5.0.0-beta-20210916 https://github.com/tesseract-ocr/tesseract/ -RUN cd tesseract && ./autogen.sh && ./configure && make && make install && ldconfig - -RUN apt update --allow-releaseinfo-change \ - && apt-get install -y tesseract-ocr-rus build-essential libcairo2 \ - libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info -ENV TESSDATA_PREFIX /usr/share/tesseract-ocr/5/tessdata/ -ENV PATH=/tesseract:$PATH - -# for reading j2k -ENV OPENCV_IO_ENABLE_JASPER "true" - -# --------------------------------------------------DOCTR INSTALLATION-------------------------------------------------- -# ATTENTION: don't change an order of pip's package install here, otherwise you get conflicts -# RUN pip install setuptools==60.10.0 cffi==1.15.0 -# RUN pip install python-doctr==0.5.1 -# We decided to stop using Doctr. If you need it, uncomment two lines above and comment one line below to make docker image with Doctr. - -RUN pip install pyclipper==1.3.0.post4 shapely==2.0.1 Pillow==9.2.0 - -# ----------------------------------------SECURE TORCH & TORCHVISION INSTALLATION--------------------------------------- -RUN wget -O torch-1.11.0a0+git1911a63-cp39-cp39-linux_x86_64.whl https://at.ispras.ru/owncloud/index.php/s/gGZa46pboBlVZ7t/download -RUN pip install torch-1.11.0a0+git1911a63-cp39-cp39-linux_x86_64.whl -RUN wget -O torchvision-0.12.0a0+9b5a3fe-cp39-cp39-linux_x86_64.whl https://at.ispras.ru/owncloud/index.php/s/doFEAhID6OhNCkp/download -RUN pip install torchvision-0.12.0a0+9b5a3fe-cp39-cp39-linux_x86_64.whl -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/anaconda3/lib/ diff --git a/docker/DockerfilePytorch b/docker/DockerfilePytorch deleted file mode 100644 index 8e0ba6f7..00000000 --- a/docker/DockerfilePytorch +++ /dev/null @@ -1,45 +0,0 @@ -ARG REPOSITORY="docker.io" -FROM ubuntu:bionic-20210118 - -RUN apt-get update && \ - apt-get install -y curl wget git vim clang python3 python3-pip \ - build-essential gcc-multilib g++-multilib unzip - -RUN wget https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-linux.zip \ - && unzip ninja-linux.zip && mv ninja /usr/bin && rm ninja-linux.zip - -RUN curl -L -O https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-x86_64.sh && \ - mkdir /cmake && \ - bash cmake-3.22.1-linux-x86_64.sh --prefix=/cmake --exclude-subdir --skip-license && \ - ln -s /cmake/bin/cmake /bin/cmake && \ - rm cmake-3.22.1-linux-x86_64.sh - -# Clone target from GitHub. -RUN pip3 install --upgrade pip && \ - pip3 install scikit-build astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions - -ADD pytorch.tar.gz / # put the file pytorch.tar.gz to the root of the repository - -WORKDIR / -RUN curl https://repo.anaconda.com/archive/Anaconda3-2022.10-Linux-x86_64.sh --output anaconda.sh -RUN bash anaconda.sh -b -p /anaconda3 -ENV PATH=/anaconda3/bin:$PATH -RUN conda init bash -RUN bash - -RUN conda install astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions future six requests dataclasses mkl mkl-include - -WORKDIR /pytorch -ENV CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} -RUN git config --global --add safe.directory '*' -RUN python3 setup.py build -RUN python3 setup.py bdist_wheel -RUN pip3 install dist/torch-1.11.0a0+git1911a63-cp39-cp39-linux_x86_64.whl - -WORKDIR / -RUN git clone https://github.com/pytorch/vision.git torchvision && cd /torchvision && \ - git checkout v0.12.0 -WORKDIR /torchvision -RUN python3 setup.py build -RUN python3 setup.py bdist_wheel -RUN pip3 install dist/torchvision-0.12.0a0+9b5a3fe-cp39-cp39-linux_x86_64.whl diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index b76b1183..00000000 --- a/docker/README.md +++ /dev/null @@ -1,28 +0,0 @@ - - -# How to change base image version for building dedoc using docker -## Change the DockerfileBaseimg file - -This file is used for building an image with tesseract-ocr, libreoffice, secure pytorch and python tools in order to -reduce time for its building in the main docker/Dockerfile - -## Build the new baseimg image locally - -Run the command below from the project root - -```shell -export VERSION_TAG=$(date '+%Y_%m_%d') -docker build -t dedocproject/baseimg:version_$VERSION_TAG -f docker/DockerfileBaseimg . -``` - -## Push the built image to the remote repository - -The commands below allow to push the image to the [docker-hub](https://hub.docker.com). -You need login and password for this purpose. - -```shell -docker login -u dedocproject -p -docker tag dedocproject/baseimg:version_$VERSION_TAG dedocproject/baseimg:latest -docker push dedocproject/baseimg:version_$VERSION_TAG -docker push dedocproject/baseimg:latest -``` \ No newline at end of file diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 4ce73f53..a300853f 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,18 @@ Changelog ========= +v0.11.1 (2023-08-30) +-------------------- +Release note: `v0.11.1 `_ + +* Add bbox annotations in `PdfTabbyReader`. +* Add bbox annotations for words in `PdfTxtlayerReader`. +* Add an option `plain_text` to the `return_format` parameter. +* Reduce size of the dedoc base image, move dockerfiles to the `separate repository `_. +* Refactor script for tesseract benchmarking. +* Make fixed dedoc dependencies as ranges. +* Add table cell properties in `PdfTabbyReader`. + v0.11.0 (2023-08-22) -------------------- Release note: `v0.11.0 `_ diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 5fcf9363..ef7f5bf1 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -95,7 +95,7 @@ Api parameters description This type is used for choosing a specific structure constructor after document structure extraction. * - return_format - - json, pretty_json, html, tree + - json, pretty_json, html, plain_text, tree - json - The output format of the result data. The document structure from a structure constructor (see :class:`~dedoc.data_structures.ParsedDocument`) @@ -107,6 +107,8 @@ Api parameters description * **html** -- :class:`~dedoc.data_structures.ParsedDocument` is transformed into html file with styles and headers according to the extracted annotations and structure; + * **plain_text** -- simple textual lines of the document; + * **tree** -- simple document tree representation in html format (useful for structure visualization). * - :cspan:`3` **Attachments handling** diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index c835fbf8..42ae2ea8 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -73,7 +73,7 @@ If you don't need converters, you can skip this step. 2. Install `Tesseract OCR 5` framework. -You can try any tutorial for this purpose or look `here `_ +You can try any tutorial for this purpose or look `here `_ to get the example of Tesseract installing for dedoc container. diff --git a/requirements.txt b/requirements.txt index 07d89808..67043b0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,42 +1,42 @@ -Cython==0.29.28 -beautifulsoup4==4.10.0 -charset-normalizer==2.0.12 +beautifulsoup4>=4.10.0,<=4.12.2 +charset-normalizer>=2.0.12,<=3.2.0 +Cython>=0.29.28,<=3.0.2 docx==0.2.4 -huggingface-hub==0.14.1 -itsdangerous==2.1.0 +fastapi>=0.77.0,<=0.103.0 +huggingface-hub>=0.14.1,<=0.16.4 +imutils==0.5.4 +itsdangerous>=2.1.0,<=2.1.2 +numpy>=1.22.0,<=1.22.3 olefile~=0.46 -python-docx==0.8.11 -python-pptx==0.6.21 -requests==2.29.0 -ujson==5.3.0 -orjson==3.8.11 -xlrd==1.2.0 -numpy==1.22.3 -PyPDF2==1.26.0 -python-Levenshtein==0.12.2 +opencv-python>=4.5.5.64,<4.6.0 +orjson>=3.8.11,<=3.9.5 +pandas>=1.4.1,<=1.9.0 +pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c' pdfminer.six==20211012 +piexif==1.1.3 +pylzma==0.5.0 +PyPDF2==1.27.0 pytesseract==0.3.10 -scikit_learn==1.0.2 -imutils==0.5.4 -opencv-python==4.5.5.64 -pandas==1.4.1 -pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c' +python-docx==0.8.11 +python-Levenshtein==0.12.2 +python-logstash-async>=2.5.0,<=2.7.0 +python-multipart==0.0.6 +python-pptx==0.6.21 rarfile==4.0 -roman==3.3 -scipy==1.8.0 -six==1.16.0 -pylzma==0.5.0 -xgboost==1.1.1 -piexif==1.1.3 +requests>=2.22.0 +roman>=3.3,<4.0 +scikit-image>=0.19.3,<=0.21.0 +scikit_learn>=1.0.2,<=1.3.0 +scipy>=1.8.0,<=1.11.2 +six==1.14.0 +starlette>=0.26.1,<0.27.0 texttable==1.6.7 -scikit-image==0.19.3 -fastapi==0.95.1 -uvicorn==0.22.0 -python-logstash-async==2.5.0 -python-multipart==0.0.6 -starlette==0.26.1 -xgbfir==0.3.1 +ujson>=5.4.0,<=5.8.0 +uvicorn>=0.18.0,<=0.23.2 wget==3.2 +xgbfir==0.3.1 +xgboost>=1.1.1,<1.2.0 +xlrd==1.2.0 # TODO remove all flask Werkzeug==2.0.3 Flask==2.0.3 diff --git a/resources/benchmarks/tesseract.benchmark b/resources/benchmarks/tesseract_benchmark.txt similarity index 89% rename from resources/benchmarks/tesseract.benchmark rename to resources/benchmarks/tesseract_benchmark.txt index 864ecaed..6a59d51a 100644 --- a/resources/benchmarks/tesseract.benchmark +++ b/resources/benchmarks/tesseract_benchmark.txt @@ -1,17 +1,16 @@ -Tesseract version is 5.0.0-alpha-20210401-94-ga968 -Table 1 - Accuracy for each file +Tesseract version is 5.0.0 +---------------+---------------------+-------+-----------------+--------------+ | Dataset | Image name | --psm | Amount of words | Accuracy OCR | +===============+=====================+=======+=================+==============+ -| english-words | Kaspersky | 6 | 111 | 99.400 | +| english-words | Kaspersky | 6 | 111 | 99.600 | +---------------+---------------------+-------+-----------------+--------------+ -| english-words | USB | 6 | 4 | 80.900 | +| english-words | USB | 6 | 4 | 85.700 | +---------------+---------------------+-------+-----------------+--------------+ -| english-words | words1 | 6 | 19 | 99.200 | +| english-words | words1 | 6 | 19 | 100 | +---------------+---------------------+-------+-----------------+--------------+ -| english-words | words2 | 6 | 9 | 98.400 | +| english-words | words2 | 6 | 9 | 100 | +---------------+---------------------+-------+-----------------+--------------+ -| english-words | words3 | 6 | 9 | 98.100 | +| english-words | words3 | 6 | 9 | 100 | +---------------+---------------------+-------+-----------------+--------------+ | others | Zaklyuchenie_nevrol | 4 | 525 | 83.800 | | | oga_00 | | | | @@ -19,45 +18,45 @@ Table 1 - Accuracy for each file | others | Zaklyuchenie_nevrol | 4 | 241 | 88.800 | | | oga_01 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 86 | +| others | napalm_doc_2_2_6 | 4 | 124 | 85.500 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 695 | 99.700 | +| tz-npa | 1.620e+14 | 4 | 695 | 99.800 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | 1.620e+14 | 4 | 696 | 99.700 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | 1.620e+14 | 4 | 699 | 99.800 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | article_multiline | 4 | 471 | 99.900 | +| tz-npa | article_multiline | 4 | 471 | 100 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_00 | 4 | 192 | 95.200 | +| tz-npa | fstek17_00 | 4 | 192 | 95.300 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | fstek17_01 | 4 | 332 | 99.700 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | law_image | 4 | 182 | 99.500 | +| tz-npa | law_image | 4 | 182 | 99.600 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | napalm_doc_13_2 | 4 | 243 | 97.500 | +| tz-npa | napalm_doc_13_2 | 4 | 243 | 97.600 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | ukaz_prezidenta_1 | 4 | 264 | 99.800 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | ukodeksrf_00 | 4 | 287 | 99.900 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_01 | 4 | 340 | 99.500 | +| tz-npa | ukodeksrf_01 | 4 | 340 | 99.600 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 146 | 95.600 | +| tz-npa | with_applications_0 | 4 | 146 | 95.700 | | | 0 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 276 | 99.500 | +| tz-npa | with_applications_0 | 4 | 276 | 99.600 | | | 1 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 165 | 98.700 | +| tz-npa | with_applications_0 | 4 | 165 | 98.800 | | | 2 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 90 | 99.400 | +| tz-npa | with_applications_0 | 4 | 90 | 99.600 | | | 3 | | | | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_00 | 4 | 78 | 97.700 | +| tz-npa | ТЗ_00 | 4 | 78 | 97.900 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_01 | 4 | 296 | 98.200 | +| tz-npa | ТЗ_01 | 4 | 296 | 98.300 | +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | ТЗ_02 | 4 | 309 | 98.800 | +---------------+---------------------+-------+-----------------+--------------+ @@ -69,27 +68,25 @@ Table 1 - Accuracy for each file +---------------+---------------------+-------+-----------------+--------------+ | tz-npa | ТЗ_06 | 4 | 219 | 93.500 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_07 | 4 | 233 | 98.500 | +| tz-npa | ТЗ_07 | 4 | 233 | 98.600 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_08 | 4 | 284 | 97.100 | +| tz-npa | ТЗ_08 | 4 | 284 | 97.200 | +---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_09 | 4 | 154 | 97.400 | +| tz-npa | ТЗ_09 | 4 | 154 | 97.500 | +---------------+---------------------+-------+-----------------+--------------+ - -Table 2 - AVG by each type of symbols: +--------+--------+--------+--------+--------+--------+--------+-------+-------+ | Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | | t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | | | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | | | s | ols | | ars | bols | | | | +========+========+========+========+========+========+========+=======+=======+ -| englis | 89.280 | 99.333 | 100 | 0 | 0 | 94.540 | 152 | 95.20 | +| englis | 100 | 99.333 | 100 | 0 | 0 | 94.540 | 152 | 97.06 | | h- | | | | | | | | 0 | | words | | | | | | | | | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.567 | 77.400 | 89.533 | 0 | 0 | 86.433 | 890 | 86.20 | -| | | | | | | | | 0 | +| others | 90.967 | 79.867 | 89.533 | 0 | 0 | 86.133 | 890 | 86.03 | +| | | | | | | | | 3 | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| tz-npa | 98.824 | 91.064 | 92.076 | 0 | 0 | 99.480 | 7483 | 98.32 | -| | | | | | | | | 8 | +| tz-npa | 99.268 | 91.064 | 92.076 | 0 | 0 | 99.480 | 7483 | 98.39 | +| | | | | | | | | 6 | +--------+--------+--------+--------+--------+--------+--------+-------+-------+ \ No newline at end of file diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index 67d66b5a..ad02ad61 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -2,6 +2,9 @@ import unittest from typing import List +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation +from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -267,3 +270,30 @@ def test_pdf_with_tables(self) -> None: node = self._get_by_tree_path(tree, "0.4.2") self.assertEqual("list_item", node["metadata"]["paragraph_type"]) self.assertEqual("3. В соответствии с полученной", node["text"].strip()[:30]) + + def test_pdf_annotations(self) -> None: + file_name = "Document635.pdf" + result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby")) + content = result["content"]["structure"]["subparagraphs"] + annotations = content[0]["annotations"] + annotation_names = {annotation["name"] for annotation in annotations} + self.assertIn(BoldAnnotation.name, annotation_names) + self.assertIn(SpacingAnnotation.name, annotation_names) + self.assertIn(BBoxAnnotation.name, annotation_names) + + def test_tables_with_merged_cells(self) -> None: + file_name = "big_table_with_merged_cells.pdf" + result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby")) + table = result["content"]["tables"][0] + cell_properties = table["metadata"]["cell_properties"] + + hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]] + + for (i, j), k in hidden_cells_big_table_with_colspan: + self.assertFalse(cell_properties[i][j]["invisible"]) + self.assertEqual(cell_properties[i][j]["rowspan"], 1) + self.assertEqual(cell_properties[i][j]["colspan"], k) + + self.assertFalse(cell_properties[3][0]["invisible"]) + self.assertEqual(cell_properties[3][0]["rowspan"], 3) + self.assertEqual(cell_properties[3][0]["colspan"], 4) diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py index 407d97af..1f5c51e1 100644 --- a/tests/api_tests/test_api_format_pdf_with_text.py +++ b/tests/api_tests/test_api_format_pdf_with_text.py @@ -1,5 +1,4 @@ import os -import unittest from typing import List from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -13,24 +12,30 @@ def _get_abs_path(self, file_name: str) -> str: def __filter_by_name(self, annotations: List[dict], name: str) -> List[dict]: return [annotation for annotation in annotations if annotation["name"] == name] - @unittest.skip("TODO") + def __get_annotation_names(self, annotations: List[dict]) -> List[str]: + return [annotation["name"] for annotation in annotations] + def test_pdf_with_text_style(self) -> None: file_name = "diff_styles.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false")) tree = result["content"]["structure"] self._check_tree_sanity(tree) - node = self._get_by_tree_path(tree, "0.0") + node = self._get_by_tree_path(tree, "0.0.0") self.assertEqual("1.1TimesNewRomanItalicBold20\n", node["text"]) self.assertIn({"start": 0, "end": 28, "name": "size", "value": "20.0"}, node["annotations"]) + annotation_names = self.__get_annotation_names(node["annotations"]) + self.assertListEqual(["bounding box", "style", "size", "color_annotation", "spacing"], annotation_names) - node = self._get_by_tree_path(tree, "0.1") + node = self._get_by_tree_path(tree, "0.0.0.0") annotations_size = self.__filter_by_name(name="size", annotations=node["annotations"]) self.assertIn({"start": 0, "end": 26, "name": "size", "value": "16.0"}, annotations_size) - self.assertEqual(len(node["annotations"]), 5) + self.assertEqual(len(node["annotations"]), 6) + annotation_names = self.__get_annotation_names(node["annotations"]) self.assertEqual("Different styles(Arial16):\n", node["text"]) + self.assertListEqual(["bounding box", "bounding box", "style", "size", "color_annotation", "spacing"], annotation_names) - node = self._get_by_tree_path(tree, "0.2.2") + node = self._get_by_tree_path(tree, "0.1.2") self.assertEqual("3. TimesNewRomanItalic14, Calibri18, Tahoma16\n", node["text"]) self.assertEqual("3. ", node["text"][0:3]) self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"]) @@ -44,7 +49,14 @@ def test_pdf_with_text_style(self) -> None: self.assertEqual("Tahoma16\n", node["text"][37:46]) self.assertIn({"start": 37, "end": 45, "value": "Tahoma", "name": "style"}, node["annotations"]) self.assertIn({"start": 37, "end": 45, "name": "size", "value": "16.0"}, node["annotations"]) - self.assertEqual(9, len(node["annotations"])) + self.assertEqual(12, len(node["annotations"])) + + word_bboxes = self.__filter_by_name(node["annotations"], "bounding box") + self.assertEqual(len(word_bboxes), 4) + self.assertEqual("3.", node["text"][word_bboxes[0]["start"]:word_bboxes[0]["end"]]) + self.assertEqual("TimesNewRomanItalic14,", node["text"][word_bboxes[1]["start"]:word_bboxes[1]["end"]]) + self.assertEqual("Calibri18,", node["text"][word_bboxes[2]["start"]:word_bboxes[2]["end"]]) + self.assertEqual("Tahoma16", node["text"][word_bboxes[3]["start"]:word_bboxes[3]["end"]]) def test_pdf_with_text_style_2(self) -> None: file_name = "2-column-state.pdf" @@ -65,7 +77,6 @@ def test_pdf_with_text_style_2(self) -> None: self.assertIn("Pere Manils, Abdelberi Chaabane, Stevens Le Blond,", self._get_by_tree_path(tree, "0.1")["text"]) - @unittest.skip("TODO") def test_pdf_with_2_columns_text(self) -> None: file_name = "2-column-state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false")) @@ -75,17 +86,17 @@ def test_pdf_with_2_columns_text(self) -> None: self.assertIn("Privacy of users in P2P networks goes far beyond their\n" "current usage and is a fundamental requirement to the adop-\n" "tion of P2P protocols for legal usage. In a climate of cold", - self._get_by_tree_path(tree, "0.5")["text"]) + self._get_by_tree_path(tree, "0.4.1.2")["text"]) - self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6")["text"]) - self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.7")["text"]) + self.assertIn("Keywords", self._get_by_tree_path(tree, "0.4.1.3")["text"]) + self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.4.1.4")["text"]) - self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.8.0.0")["text"]) + self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0.0")["text"]) self.assertIn("The Tor network was designed to provide freedom\n" "of speech by guaranteeing anonymous communications.\n" "Whereas the cryptographic foundations of Tor, based on\n" "onion-routing [3, 9, 22, 24], are known to be robust, identity", - self._get_by_tree_path(tree, "0.8.0.1")["text"]) + self._get_by_tree_path(tree, "0.5.0.1")["text"]) def test_pdf_with_2_columns_text_2(self) -> None: file_name = "liters_state.pdf" diff --git a/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf b/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf new file mode 100644 index 00000000..c1298ab4 Binary files /dev/null and b/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf differ