diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..ff7c92ca --- /dev/null +++ b/.flake8 @@ -0,0 +1,26 @@ +[flake8] + +max-line-length = 160 +max-complexity = 13 +inline-quotes = " + +application-import-names = dedoc, tests +import-order-style = pycharm + +exclude = + .git, + __pycache__, + .idea, + .github, + *__init__.py, + resources, + dedoc/scripts, + examples, + docs, + venv, + build, + dedoc.egg-info + +# ANN101 - type annotations for self +ignore = + ANN101 diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml index 9b2f1917..de74c6df 100644 --- a/.github/workflows/test_on_push.yaml +++ b/.github/workflows/test_on_push.yaml @@ -29,11 +29,11 @@ jobs: uses: actions/setup-python@v2 with: python-version: '3.8' - - name: Install dependencies + - name: Run lint run: | python3 -m pip install --upgrade pip - pip3 install pycodestyle==2.7.0 flake8==3.9.2 flake8-annotations==2.6.2 pyflakes==2.3.1 + pip3 install .[lint] + flake8 . - name: Run tests run: | - python3 -m unittest -v -f tests/test_style.py test="true" docker-compose up --build --exit-code-from test diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..330ca49d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +repos: +- repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info + args: + - "--config=.flake8" + additional_dependencies: [ + flake8-absolute-import==1.0.0.1, + flake8-annotations==2.9.1, + flake8-bugbear==23.3.12, + flake8-builtins==2.1.0, + flake8-import-order==0.18.2, + flake8-print==5.0.0, + flake8-quotes==3.3.2, + flake8-use-fstring==1.4, + pycodestyle==2.9.0, + pep8-naming==0.13.3 + ] diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 463ab2c9..d17bfd6f 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -1,5 +1,5 @@ -# noqa from typing import Any, Optional + from fastapi import Body from pydantic import BaseModel @@ -36,68 +36,68 @@ class QueryParameters(BaseModel): def __init__(self, # type of document structure parsing - document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), - structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), - return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), + document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), # noqa + structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), # noqa + return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), # noqa # attachments handling - with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None), - need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), - recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), - return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), - attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), + with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None), # noqa + need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), # noqa + recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), # noqa + return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), # noqa + attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), # noqa # tables handling - insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), - need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), - table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), - orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), - orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), + insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), # noqa + need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), # noqa + table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), # noqa + orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), # noqa + orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), # noqa # pdf handling - pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), - language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), - pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), - is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), - document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), - need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), - need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), + pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), # noqa + language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), # noqa + pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), # noqa + is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), # noqa + document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), # noqa + need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), # noqa + need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), # noqa # other formats handling - delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), - encoding: Optional[str] = Body(description="a document encoding", default=None), - html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), - handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None), + delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), # noqa + encoding: Optional[str] = Body(description="a document encoding", default=None), # noqa + html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), # noqa + handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None), # noqa - **data: Any) -> None: + **data: Any) -> None: # noqa super().__init__(**data) - self.document_type: str = document_type or "" - self.structure_type: str = structure_type or 'tree' - self.return_format: str = return_format or 'json' - - self.with_attachments: str = with_attachments or 'false' - self.need_content_analysis: str = need_content_analysis or 'false' - self.recursion_deep_attachments: str = recursion_deep_attachments or '10' - self.return_base64: str = return_base64 or 'false' - self.attachments_dir: str = attachments_dir - - self.insert_table: str = insert_table or 'false' - self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true' - self.table_type: str = table_type or '' - self.orient_analysis_cells: str = orient_analysis_cells or 'false' - self.orient_cell_angle: str = orient_cell_angle or "90" - - self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby' - self.language: str = language or "rus+eng" - self.pages: str = pages or ':' - self.is_one_column_document: str = is_one_column_document or 'auto' - self.document_orientation: str = document_orientation or "auto" - self.need_header_footer_analysis: str = need_header_footer_analysis or 'false' - self.need_binarization: str = need_binarization or 'false' - - self.delimiter: str = delimiter - self.encoding: str = encoding - self.html_fields: str = html_fields or '' - self.handle_invisible_table: str = handle_invisible_table or 'false' + self.document_type: str = document_type or "" + self.structure_type: str = structure_type or "tree" + self.return_format: str = return_format or "json" + + self.with_attachments: str = with_attachments or "false" + self.need_content_analysis: str = need_content_analysis or "false" + self.recursion_deep_attachments: str = recursion_deep_attachments or "10" + self.return_base64: str = return_base64 or "false" + self.attachments_dir: str = attachments_dir + + self.insert_table: str = insert_table or "false" + self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true" + self.table_type: str = table_type or "" + self.orient_analysis_cells: str = orient_analysis_cells or "false" + self.orient_cell_angle: str = orient_cell_angle or "90" + + self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby" + self.language: str = language or "rus+eng" + self.pages: str = pages or ":" + self.is_one_column_document: str = is_one_column_document or "auto" + self.document_orientation: str = document_orientation or "auto" + self.need_header_footer_analysis: str = need_header_footer_analysis or "false" + self.need_binarization: str = need_binarization or "false" + + self.delimiter: str = delimiter + self.encoding: str = encoding + self.html_fields: str = html_fields or "" + self.handle_invisible_table: str = handle_invisible_table or "false" diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index 1f289ca7..bd765535 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Dict, Iterator, Set +from typing import Dict, Iterator, List, Optional, Set from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation @@ -24,7 +24,7 @@ def __prettify_text(text: str) -> Iterator[str]: yield " ".join(res) -def _node2tree(paragraph: 'TreeNode', depth: int, depths: Set[int] = None) -> str: +def _node2tree(paragraph: TreeNode, depth: int, depths: Set[int] = None) -> str: if depths is None: depths = set() @@ -33,31 +33,29 @@ def _node2tree(paragraph: 'TreeNode', depth: int, depths: Set[int] = None) -> st space = "".join(space) node_result = [] - node_result.append(" {} {} ".format( - space, paragraph.metadata.hierarchy_level.line_type + " " + paragraph.node_id)) + node_result.append(f" {space} {paragraph.metadata.hierarchy_level.line_type} {paragraph.node_id} ") for text in __prettify_text(paragraph.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = "".join(space) - node_result.append("
{} {}
".format(space, text)) + node_result.append(f"{space} {text}
") if len(paragraph.subparagraphs) > 0: - sub_nodes = "\n".join([_node2tree(sub_node, depth=depth + 1, depths=depths.union({depth})) - for sub_node in paragraph.subparagraphs]) - return """ + sub_nodes = "\n".join([_node2tree(sub_node, depth=depth + 1, depths=depths.union({depth})) for sub_node in paragraph.subparagraphs]) + return f"""- {} + {"".join(node_result)}
- """.format("".join(node_result)) + """ -def json2collapsed_tree(paragraph: 'TreeNode') -> str: - result = """ +def json2collapsed_tree(paragraph: TreeNode) -> str: + result = f""" @@ -66,15 +64,15 @@ def json2collapsed_tree(paragraph: 'TreeNode') -> str: - {} + {_node2tree(paragraph, depth=0)} - """.format(_node2tree(paragraph, depth=0)) + """ return result -def json2tree(paragraph: 'TreeNode') -> str: +def json2tree(paragraph: TreeNode) -> str: stack = [paragraph] nodes = [] while len(stack) > 0: @@ -94,14 +92,13 @@ def json2tree(paragraph: 'TreeNode') -> str: depths = {d for d in depths if d <= depth} space = [space_symbol] * 4 * (depth - 1) + 4 * ["-"] space = __add_vertical_line(depths, space) - node_result.append("{} {}
".format( - space, node.metadata.hierarchy_level.line_type + " " + node.node_id)) + node_result.append(f"{space} {node.metadata.hierarchy_level.line_type} {node.node_id}
") for text in __prettify_text(node.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = __add_vertical_line(depths, space) - node_result.append("{} {}
".format(space, text)) + node_result.append(f"{space} {text}
") result.extend(reversed(node_result)) - result.append("{tab} {text} id = {id} ; type = {type}
".format( - tab=" " * tabs, - text=ptext, - type=str(paragraph.metadata.hierarchy_level.line_type), - id=paragraph.node_id - ) + text += f'{" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type}
' for subparagraph in paragraph.subparagraphs: text = json2html(text=text, paragraph=subparagraph, tables=None, tabs=tabs + 4, table2id=table2id) @@ -175,7 +163,7 @@ def __value2tag(name: str, value: str) -> str: return value -def __annotations2html(paragraph: 'TreeNode', table2id: Dict[str, int]) -> str: +def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: indexes = dict() for annotation in paragraph.annotations: @@ -198,8 +186,7 @@ def __annotations2html(paragraph: 'TreeNode', table2id: Dict[str, int]) -> str: indexes.setdefault(annotation.start, "") indexes.setdefault(annotation.end, "") if name == "table": - indexes[annotation.start] += '' \ - .format(uid=tag, index_table=table2id[tag]) + indexes[annotation.start] += f'' else: indexes[annotation.start] += "<" + tag + ">" indexes[annotation.end] = "" + tag + ">" + indexes[annotation.end] @@ -215,10 +202,8 @@ def __annotations2html(paragraph: 'TreeNode', table2id: Dict[str, int]) -> str: def __table2html(table: Table, table2id: Dict[str, int]) -> str: uid = table.metadata.uid - text = "{}
".format(line) + response += f"{line}
" return HTMLResponse(response, status_code=202) -@app.get('/info_classifiers') +@app.get("/info_classifiers") def get_classifiers_info() -> Response: return FileResponse(os.path.join(static_path, "train_dataset/refit_classifier.html")) -@app.get('/static_file') +@app.get("/static_file") def get_static_file(request: Request) -> Response: path = _get_static_file_path(request) return FileResponse(path) -@app.get('/return-file/{filename}') +@app.get("/return-file/{filename}") def return_files(filename: str) -> Response: file_path = os.path.join(UPLOAD_FOLDER, filename) return FileResponse(file_path) -@app.get('/clear') +@app.get("/clear") def clear() -> Response: shutil.rmtree(config["intermediate_data_path"]) os.makedirs(config["intermediate_data_path"]) diff --git a/dedoc/api/train_dataset/async_archive_handler.py b/dedoc/api/train_dataset/async_archive_handler.py index 43b63b1a..09b6d39d 100644 --- a/dedoc/api/train_dataset/async_archive_handler.py +++ b/dedoc/api/train_dataset/async_archive_handler.py @@ -10,7 +10,7 @@ from fastapi import UploadFile -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.dedoc_manager import DedocManager from dedoc.train_dataset.taskers.tasker import Tasker @@ -35,19 +35,15 @@ def run(self) -> None: else: uid, parameters, file = self.queue.get() self.results[uid] = self._handle_archive(path=file, parameters=parameters, uid=uid) - self.logger.info("FINISH {}".format(uid)) + self.logger.info(f"FINISH {uid}") def _handle_archive(self, uid: str, path: str, parameters: dict) -> str: try: with zipfile.ZipFile(path, "r") as archive: for i, file in enumerate(archive.namelist()): - self.progress[uid] = "files done\t= {} \n files_in_progress\t= {}\n total\t= {}".format( - i, 1, len(archive.namelist()) - ) + self.progress[uid] = f"files done\t= {i} \n files_in_progress\t= {1}\n total\t= {len(archive.namelist())}" self.__handle_one_file(archive, file, parameters) - self.progress[uid] = "files done\t= {} \n files_in_progress\t= {}\n total\t= {}".format( - i + 1, 0, len(archive.namelist()) - ) + self.progress[uid] = f"files done\t= {i + 1} \n files_in_progress\t= {0}\n total\t= {len(archive.namelist())}" task, _ = self.tasker.create_tasks( type_of_task=parameters["type_of_task"], @@ -56,11 +52,11 @@ def _handle_archive(self, uid: str, path: str, parameters: dict) -> str: ) return task except Exception as e: - self.progress[uid] = "Fail with\n {}".format(e) + self.progress[uid] = f"Fail with\n{e}" raise e def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dict) -> None: - self.logger.info("Start handle {}".format(file)) + self.logger.info(f"Start handle {file}") with TemporaryDirectory() as tmpdir: try: with archive.open(file) as item: @@ -71,9 +67,9 @@ def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dic with open(path_out, "wb") as file_out: file_out.write(item.read()) self.manager.parse(file_path=path_out, parameters=parameters) - except BadFileFormatException as e: - self.logger.warning("Can't handle file {}, exception {}".format(file, str(e))) - self.logger.info("Finish handle {}".format(file)) + except BadFileFormatError as e: + self.logger.warning(f"Can't handle file {file}, exception {str(e)}") + self.logger.info(f"Finish handle {file}") class AsyncHandler: diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py index 2c5d5643..55fb06e9 100644 --- a/dedoc/attachments_extractors/abstract_attachment_extractor.py +++ b/dedoc/attachments_extractors/abstract_attachment_extractor.py @@ -1,7 +1,7 @@ import os import uuid from abc import ABC, abstractmethod -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from dedoc.data_structures.attached_file import AttachedFile from dedoc.utils.utils import save_data_to_unique_file @@ -48,10 +48,7 @@ def with_attachments(parameters: dict) -> bool: """ return str(parameters.get("with_attachments", "false")).lower() == "true" - def _content2attach_file(self, - content: List[Tuple[str, bytes]], - tmpdir: str, - need_content_analysis: bool) -> List[AttachedFile]: + def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool) -> List[AttachedFile]: attachments = [] for original_name, contents in content: tmp_file_name = save_data_to_unique_file(directory=tmpdir, filename=original_name, binary_data=contents) diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py index 171a233c..15f4adcb 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py @@ -63,7 +63,7 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachments_dir: str) -> List[AttachedFile]: result = [] - with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile: + with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: files = zfile.namelist() attachments = [file for file in files if file.startswith((f"{attachments_dir}/media/", f"{attachments_dir}/embeddings/"))] @@ -71,10 +71,10 @@ def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachm original_name = os.path.split(attachment)[-1] # these are windows metafile extensions - if original_name.endswith(('.emf', 'wmf')): + if original_name.endswith((".emf", "wmf")): continue - if not original_name.endswith('.bin'): + if not original_name.endswith(".bin"): result.append((original_name, zfile.read(attachment))) else: with zfile.open(attachment) as f: @@ -82,9 +82,9 @@ def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachm # extracting PDF-files if ole.exists("CONTENTS"): - data = ole.openstream('CONTENTS').read() - if data[0:5] == b'%PDF-': - result.append((os.path.splitext(original_name)[-2] + '.pdf', data)) + data = ole.openstream("CONTENTS").read() + if data[0:5] == b"%PDF-": + result.append((f"{os.path.splitext(original_name)[-2]}.pdf", data)) # extracting files in other formats elif ole.exists("\x01Ole10Native"): diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py index 93959faf..5c9be9c9 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py @@ -8,7 +8,7 @@ from bs4 import BeautifulSoup, Tag from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes @@ -32,7 +32,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ """ result = [] try: - with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile: + with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diagram_attachments = self.__extract_diagrams(zfile) need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis) @@ -40,7 +40,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word") except zipfile.BadZipFile: - raise BadFileFormatException("Bad docx file:\n file_name = {}. Seems docx is broken".format(filename)) + raise BadFileFormatError(f"Bad docx file:\n file_name = {filename}. Seems docx is broken") return result def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: @@ -52,12 +52,12 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: """ result = [] try: - content = document.read('word/document.xml') + content = document.read("word/document.xml") except KeyError: - content = document.read('word/document2.xml') + content = document.read("word/document2.xml") content = re.sub(br"\n[\t ]*", b"", content) - bs = BeautifulSoup(content, 'xml') + bs = BeautifulSoup(content, "xml") paragraphs = [p for p in bs.body] diagram_paragraphs = [] @@ -81,10 +81,10 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: paragraph = p.extract() uid = hashlib.md5(paragraph.encode()).hexdigest() - with open(f'{tmpdir}/word/document.xml', 'w') as f: + with open(f"{tmpdir}/word/document.xml", "w") as f: f.write(doc_text) diagram_name = f"{uid}.docx" - with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode='w') as new_d: + with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d: for filename in namelist: new_d.write(os.path.join(tmpdir, filename), arcname=filename) with open(os.path.join(tmpdir, diagram_name), "rb") as f: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py index ff9507c4..bbcf1953 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py @@ -25,7 +25,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ """ attachments = [] name, ext = splitext_(filename) - if ext.lower() != '.xlsx': + if ext.lower() != ".xlsx": return attachments return self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="xl") diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py index bd6767e7..706e34c4 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py @@ -1,5 +1,5 @@ -import os import json +import os from typing import List, Optional from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor @@ -42,17 +42,17 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ for keys in field_keys: path = json.dumps(keys, ensure_ascii=False) - attached_filename = path + '.html' + attached_filename = f"{path}.html" attachment_file_path = os.path.join(tmpdir, attached_filename) field_content = self.__get_value_by_keys(data, keys) if not isinstance(field_content, str): continue - with open(attachment_file_path, 'w') as f: + with open(attachment_file_path, "w") as f: f.write(field_content) - with open(attachment_file_path, mode='rb') as f: + with open(attachment_file_path, mode="rb") as f: binary_data = f.read() attachments.append((attached_filename, binary_data)) diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py index 35127a4f..9cc35b6e 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py @@ -1,17 +1,17 @@ import logging import os import uuid -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import PyPDF2 from PyPDF2.pdf import PageObject from PyPDF2.utils import PdfReadError from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor +from dedoc.attachments_extractors.utils import create_note from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.utils.utils import convert_datetime -from dedoc.attachments_extractors.utils import create_note class PDFAttachmentsExtractor(AbstractAttachmentsExtractor): @@ -38,7 +38,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ - with open(os.path.join(tmpdir, filename), 'rb') as handler: + with open(os.path.join(tmpdir, filename), "rb") as handler: try: reader = PyPDF2.PdfFileReader(handler) except Exception as e: @@ -59,25 +59,22 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: attachments = [] - if '/Annots' in page.keys(): - for annot in page['/Annots']: + if "/Annots" in page.keys(): + for annot in page["/Annots"]: # Other subtypes, such as /Link, cause errors - subtype = annot.getObject().get('/Subtype') + subtype = annot.getObject().get("/Subtype") if subtype == "/FileAttachment": - name = annot.getObject()['/FS']['/UF'] - data = annot.getObject()['/FS']['/EF']['/F'].getData() # The file containing the stream data. + name = annot.getObject()["/FS"]["/UF"] + data = annot.getObject()["/FS"]["/EF"]["/F"].getData() # The file containing the stream data. attachments.append([name, data]) - if subtype == "/Text" and annot.getObject().get('/Name') == '/Comment': # it is messages (notes) in PDF + if subtype == "/Text" and annot.getObject().get("/Name") == "/Comment": # it is messages (notes) in PDF note = annot.getObject() created_time = convert_datetime(note["/CreationDate"]) if "/CreationDate" in note else None modified_time = convert_datetime(note["/M"]) if "/M" in note else None user = note.get("/T") data = note.get("/Contents", "") - name, content = create_note(content=data, - modified_time=modified_time, - created_time=created_time, - author=user) + name, content = create_note(content=data, modified_time=modified_time, created_time=created_time, author=user) attachments.append((name, bytes(content))) return attachments @@ -99,15 +96,15 @@ def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str """ attachments = [] catalog = reader.trailer["/Root"] - if '/Names' in catalog.keys() and '/EmbeddedFiles' in catalog['/Names'].keys() and '/Names' in catalog['/Names']['/EmbeddedFiles'].keys(): - file_names = catalog['/Names']['/EmbeddedFiles']['/Names'] + if "/Names" in catalog.keys() and "/EmbeddedFiles" in catalog["/Names"].keys() and "/Names" in catalog["/Names"]["/EmbeddedFiles"].keys(): + file_names = catalog["/Names"]["/EmbeddedFiles"]["/Names"] for f in file_names: if isinstance(f, str): data_index = file_names.index(f) + 1 dict_object = file_names[data_index].getObject() - if '/EF' in dict_object and '/F' in dict_object['/EF']: - data = dict_object['/EF']['/F'].getData() - name = dict_object.get('/UF', "pdf_attach_{}".format(uuid.uuid1())) + if "/EF" in dict_object and "/F" in dict_object["/EF"]: + data = dict_object["/EF"]["/F"].getData() + name = dict_object.get("/UF", f"pdf_attach_{uuid.uuid1()}") attachments.append((name, data)) return attachments diff --git a/dedoc/attachments_extractors/utils.py b/dedoc/attachments_extractors/utils.py index acc64fff..679677e9 100644 --- a/dedoc/attachments_extractors/utils.py +++ b/dedoc/attachments_extractors/utils.py @@ -10,6 +10,6 @@ def create_note(content: str, modified_time: int, created_time: int, author: str "created_time": created_time, "size": size if size else len(content), "author": author} - encode_data = json.dumps(note_dict).encode('utf-8') + encode_data = json.dumps(note_dict).encode("utf-8") return filename, encode_data diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index 8dcfce0a..1392277a 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -7,8 +7,8 @@ from typing import List from dedoc.attachments_extractors import AbstractAttachmentsExtractor -from dedoc.common.exceptions.dedoc_exception import DedocException -from dedoc.data_structures import ParsedDocument, DocumentMetadata, AttachedFile +from dedoc.common.exceptions.dedoc_error import DedocError +from dedoc.data_structures import AttachedFile, DocumentMetadata, ParsedDocument from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.utils.utils import get_empty_content @@ -73,7 +73,7 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy) else: parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) - except DedocException: + except DedocError: # return empty ParsedDocument with Meta information parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) diff --git a/dedoc/common/exceptions/bad_file_error.py b/dedoc/common/exceptions/bad_file_error.py new file mode 100644 index 00000000..4b800c9d --- /dev/null +++ b/dedoc/common/exceptions/bad_file_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class BadFileFormatError(DedocError): + """ + Raise if given file can't be handled by the system (for example if no reader can read this file) + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(BadFileFormatError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"BadFileFormatError({self.msg})" + + @property + def code(self) -> int: + return 415 diff --git a/dedoc/common/exceptions/bad_file_exception.py b/dedoc/common/exceptions/bad_file_exception.py deleted file mode 100644 index 0aeea0e1..00000000 --- a/dedoc/common/exceptions/bad_file_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class BadFileFormatException(DedocException): - """ - Raise if given file can't be handled by the system (for example if no reader can read this file) - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(BadFileFormatException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "BadFileException({})".format(self.msg) - - @property - def code(self) -> int: - return 415 diff --git a/dedoc/common/exceptions/bad_parameters_error.py b/dedoc/common/exceptions/bad_parameters_error.py new file mode 100644 index 00000000..dc8c0aa9 --- /dev/null +++ b/dedoc/common/exceptions/bad_parameters_error.py @@ -0,0 +1,20 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class BadParametersError(DedocError): + """ + Raise if given parameters are incorrect and can't be handled by the system + (for example if string provided instead of bool) + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(BadParametersError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"BadParametersError({self.msg})" + + @property + def code(self) -> int: + return 400 diff --git a/dedoc/common/exceptions/bad_parameters_exception.py b/dedoc/common/exceptions/bad_parameters_exception.py deleted file mode 100644 index 626d5d82..00000000 --- a/dedoc/common/exceptions/bad_parameters_exception.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class BadParametersException(DedocException): - """ - Raise if given parameters are incorrect and can't be handled by the system - (for example if string provided instead of bool) - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(BadParametersException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "BadParametersException({})".format(self.msg) - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/conversion_error.py b/dedoc/common/exceptions/conversion_error.py new file mode 100644 index 00000000..f95207b3 --- /dev/null +++ b/dedoc/common/exceptions/conversion_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class ConversionError(DedocError): + """ + Can be raised if conversion of the file ended unsuccessfully or didn't finish at all (converter terminated the process) + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(ConversionError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"ConversionError({self.msg})" + + @property + def code(self) -> int: + return 415 diff --git a/dedoc/common/exceptions/conversion_exception.py b/dedoc/common/exceptions/conversion_exception.py deleted file mode 100644 index b71b9f5a..00000000 --- a/dedoc/common/exceptions/conversion_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class ConversionException(DedocException): - """ - Can be raised if conversion of the file ended unsuccessfully or didn't finish at all (converter terminated the process) - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(ConversionException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return f"ConversionException({self.msg})" - - @property - def code(self) -> int: - return 415 diff --git a/dedoc/common/exceptions/dedoc_exception.py b/dedoc/common/exceptions/dedoc_error.py similarity index 81% rename from dedoc/common/exceptions/dedoc_exception.py rename to dedoc/common/exceptions/dedoc_error.py index 1b9cbf5d..78426e39 100644 --- a/dedoc/common/exceptions/dedoc_exception.py +++ b/dedoc/common/exceptions/dedoc_error.py @@ -3,14 +3,14 @@ import dedoc -class DedocException(Exception): +class DedocError(Exception): def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None, metadata: Optional[dict] = None) -> None: - super(DedocException, self).__init__() + super(DedocError, self).__init__() self.msg = msg self.msg_api = msg if msg_api is None else msg_api self.filename = filename @@ -18,7 +18,7 @@ def __init__(self, self.metadata = metadata def __str__(self) -> str: - return "MissingFileException({})".format(self.msg) + return f"DedocError({self.msg})" @property def code(self) -> int: diff --git a/dedoc/common/exceptions/java_not_found_error.py b/dedoc/common/exceptions/java_not_found_error.py index 62b426fb..c6d96384 100644 --- a/dedoc/common/exceptions/java_not_found_error.py +++ b/dedoc/common/exceptions/java_not_found_error.py @@ -1,21 +1,18 @@ from typing import Optional -from dedoc.common.exceptions.dedoc_exception import DedocException +from dedoc.common.exceptions.dedoc_error import DedocError -class JavaNotFoundError(DedocException): + +class JavaNotFoundError(DedocError): """ - raise if there is no JAVA + Raise if there is no JAVA """ - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: super(JavaNotFoundError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) def __str__(self) -> str: - return "JavaNotFoundError({})".format(self.msg) + return f"JavaNotFoundError({self.msg})" @property def code(self) -> int: diff --git a/dedoc/common/exceptions/minio_error.py b/dedoc/common/exceptions/minio_error.py new file mode 100644 index 00000000..6d43c64f --- /dev/null +++ b/dedoc/common/exceptions/minio_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class MinioError(DedocError): + """ + Raise if there is no file in minio + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(MinioError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"MinioError({self.msg})" + + @property + def code(self) -> int: + return 404 diff --git a/dedoc/common/exceptions/minio_exception.py b/dedoc/common/exceptions/minio_exception.py deleted file mode 100644 index a19ae189..00000000 --- a/dedoc/common/exceptions/minio_exception.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class MinioException(DedocException): - """ - raise if there is no file in minio - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(MinioException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "MinioException({})".format(self.msg) - - @property - def code(self) -> int: - return 404 diff --git a/dedoc/common/exceptions/missing_file_error.py b/dedoc/common/exceptions/missing_file_error.py new file mode 100644 index 00000000..7bc861e9 --- /dev/null +++ b/dedoc/common/exceptions/missing_file_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class MissingFileError(DedocError): + """ + Raise if there is no file in post request + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(MissingFileError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"MissingFileError({self.msg})" + + @property + def code(self) -> int: + return 400 diff --git a/dedoc/common/exceptions/missing_file_exception.py b/dedoc/common/exceptions/missing_file_exception.py deleted file mode 100644 index f6b9d654..00000000 --- a/dedoc/common/exceptions/missing_file_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class MissingFileException(DedocException): - """ - raise if there is no file in post request - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(MissingFileException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "MissingFileException({})".format(self.msg) - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/recognize_error.py b/dedoc/common/exceptions/recognize_error.py new file mode 100644 index 00000000..05c388ce --- /dev/null +++ b/dedoc/common/exceptions/recognize_error.py @@ -0,0 +1,16 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class RecognizeError(DedocError): + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(RecognizeError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"RecognizeError({self.msg})" + + @property + def code(self) -> int: + return 500 diff --git a/dedoc/common/exceptions/recognize_exception.py b/dedoc/common/exceptions/recognize_exception.py deleted file mode 100644 index 8e62147e..00000000 --- a/dedoc/common/exceptions/recognize_exception.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class RecognizeException(DedocException): - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(RecognizeException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "RecognizeException({})".format(self.msg) - - @property - def code(self) -> int: - return 500 diff --git a/dedoc/common/exceptions/structure_extractor_error.py b/dedoc/common/exceptions/structure_extractor_error.py new file mode 100644 index 00000000..1bb9bd00 --- /dev/null +++ b/dedoc/common/exceptions/structure_extractor_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class StructureExtractorError(DedocError): + """ + Raise if structure extractor can't build structured document from unstructured one. + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(StructureExtractorError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"StructureExtractorError({self.msg})" + + @property + def code(self) -> int: + return 400 diff --git a/dedoc/common/exceptions/structure_extractor_exception.py b/dedoc/common/exceptions/structure_extractor_exception.py deleted file mode 100644 index 76d738a4..00000000 --- a/dedoc/common/exceptions/structure_extractor_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class StructureExtractorException(DedocException): - """ - Raise if structure extractor can't build structured document from unstructured one. - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(StructureExtractorException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "StructureExtractorException({})".format(self.msg) - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/tabby_pdf_error.py b/dedoc/common/exceptions/tabby_pdf_error.py index 1dbb008c..eff2ec8d 100644 --- a/dedoc/common/exceptions/tabby_pdf_error.py +++ b/dedoc/common/exceptions/tabby_pdf_error.py @@ -1,21 +1,18 @@ from typing import Optional -from dedoc.common.exceptions.dedoc_exception import DedocException +from dedoc.common.exceptions.dedoc_error import DedocError -class TabbyPdfError(DedocException): + +class TabbyPdfError(DedocError): """ Error from TabbyPDF """ - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: super(TabbyPdfError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) def __str__(self) -> str: - return "TabbyPdfError({})".format(self.msg) + return f"TabbyPdfError({self.msg})" @property def code(self) -> int: diff --git a/dedoc/config.py b/dedoc/config.py index 2664e7dd..34f51297 100644 --- a/dedoc/config.py +++ b/dedoc/config.py @@ -2,14 +2,14 @@ import logging import os import sys -from typing import Optional, Any +from typing import Any, Optional logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s") DEBUG_MODE = False -RESOURCES_PATH = os.environ.get('RESOURCES_PATH', os.path.join(os.path.expanduser('~'), ".cache", "dedoc", "resources")) +RESOURCES_PATH = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources")) _config = dict( # -----------------------------------------RESOURCES PATH SETTINGS---------------------------------------------------- @@ -28,7 +28,7 @@ # max file size in bytes max_content_length=512 * 1024 * 1024, # application port - api_port=int(os.environ.get('DOCREADER_PORT', '1231')), + api_port=int(os.environ.get("DOCREADER_PORT", "1231")), static_files_dirs={}, # log settings logger=logging.getLogger(), @@ -65,7 +65,7 @@ class Configuration(object): __config = None @classmethod - def getInstance(cls: "Configuration") -> "Configuration": + def get_instance(cls: "Configuration") -> "Configuration": """ Actual object creation will happen when we use Configuration.getInstance() """ @@ -74,7 +74,7 @@ def getInstance(cls: "Configuration") -> "Configuration": return cls.__instance - def __initConfig(self, args: Optional[Any] = None) -> None: + def __init_config(self, args: Optional[Any] = None) -> None: if args is not None and args.config_path is not None: spec = importlib.util.spec_from_file_location("config_module", args.config_path) config_module = importlib.util.module_from_spec(spec) @@ -83,11 +83,11 @@ def __initConfig(self, args: Optional[Any] = None) -> None: else: self.__config = _config - def getConfig(self, args: Optional[Any] = None) -> dict: + def get_config(self, args: Optional[Any] = None) -> dict: if self.__config is None or args is not None: - self.__initConfig(args) + self.__init_config(args) return self.__config def get_config(args: Optional[Any] = None) -> dict: - return Configuration.getInstance().getConfig(args) + return Configuration.get_instance().get_config(args) diff --git a/dedoc/converters/concrete_converters/abstract_converter.py b/dedoc/converters/concrete_converters/abstract_converter.py index 2be165be..8dc05d8b 100644 --- a/dedoc/converters/concrete_converters/abstract_converter.py +++ b/dedoc/converters/concrete_converters/abstract_converter.py @@ -3,9 +3,9 @@ import subprocess import time from abc import ABC, abstractmethod -from typing import Optional, List +from typing import List, Optional -from dedoc.common.exceptions.conversion_exception import ConversionException +from dedoc.common.exceptions.conversion_error import ConversionError class AbstractConverter(ABC): @@ -57,12 +57,12 @@ def _run_subprocess(self, command: List[str], filename: str, expected_path: str) else: error_message = f"Could not convert file {filename}\n{error_message}" self.logger.error(error_message) - raise ConversionException(msg=error_message) + raise ConversionError(msg=error_message) except subprocess.TimeoutExpired: message = f"Conversion of the {filename} hadn't terminated after {self.timeout} seconds" self.logger.error(message) - raise ConversionException(msg=message) + raise ConversionError(msg=message) def _await_for_conversion(self, filename: str, tmp_dir: str) -> None: t = 0 @@ -71,4 +71,4 @@ def _await_for_conversion(self, filename: str, tmp_dir: str) -> None: t += self.period_checking if t >= self.timeout: - raise ConversionException(msg=f"fail with {tmp_dir}/{filename}", msg_api=f"Unsupported file format {filename}") + raise ConversionError(msg=f"fail with {tmp_dir}/{filename}", msg_api=f"Unsupported file format {filename}") diff --git a/dedoc/converters/concrete_converters/binary_converter.py b/dedoc/converters/concrete_converters/binary_converter.py index 183fc966..2089d66c 100644 --- a/dedoc/converters/concrete_converters/binary_converter.py +++ b/dedoc/converters/concrete_converters/binary_converter.py @@ -1,8 +1,8 @@ from typing import Optional -from dedoc.utils import supported_image_types from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter from dedoc.converters.concrete_converters.png_converter import PNGConverter +from dedoc.utils import supported_image_types class BinaryConverter(AbstractConverter): @@ -18,7 +18,7 @@ def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = No """ Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`. """ - return mime == 'application/octet-stream' and extension in supported_image_types + return mime == "application/octet-stream" and extension in supported_image_types def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py index a32af9b9..0a6abd9c 100644 --- a/dedoc/converters/concrete_converters/docx_converter.py +++ b/dedoc/converters/concrete_converters/docx_converter.py @@ -2,7 +2,7 @@ from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_mimes, converted_extensions +from dedoc.extensions import converted_extensions, converted_mimes class DocxConverter(AbstractConverter): @@ -23,9 +23,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the docx-like documents into files with .docx extension using the soffice application. """ - path_in = f"{tmp_dir}/{filename}{extension}" + path_in = os.path.join(tmp_dir, f"{filename}{extension}") command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", tmp_dir, path_in] - file_out = filename + ".docx" + file_out = f"{filename}.docx" expected_path = os.path.join(tmp_dir, file_out) self._run_subprocess(command=command, filename=filename, expected_path=expected_path) diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py index bd9ca793..661fb5c2 100644 --- a/dedoc/converters/concrete_converters/excel_converter.py +++ b/dedoc/converters/concrete_converters/excel_converter.py @@ -23,9 +23,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the xlsx-like documents into files with .xlsx extension using the soffice application. """ - path_in = f"{tmp_dir}/{filename}{extension}" + path_in = os.path.join(tmp_dir, f"{filename}{extension}") command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", tmp_dir, path_in] - file_out = filename + '.xlsx' + file_out = f"{filename}.xlsx" expected_path = os.path.join(tmp_dir, file_out) self._run_subprocess(command=command, filename=filename, expected_path=expected_path) diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py index 380fd508..1be6d839 100644 --- a/dedoc/converters/concrete_converters/pdf_converter.py +++ b/dedoc/converters/concrete_converters/pdf_converter.py @@ -1,3 +1,4 @@ +import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter @@ -23,9 +24,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the pdf-like documents into files with .pdf extension using the ddjvu application. """ - path_in = f"{tmp_dir}/{filename}{extension}" - expected_path = f"{tmp_dir}/{filename}.pdf" + path_in = os.path.join(tmp_dir, f"{filename}{extension}") + expected_path = os.path.join(tmp_dir, f"{filename}.pdf") command = ["ddjvu", "--format=pdf", path_in, expected_path] self._run_subprocess(command=command, filename=filename, expected_path=expected_path) - return filename + '.pdf' + return filename + ".pdf" diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py index f51b0426..3fdcac26 100644 --- a/dedoc/converters/concrete_converters/png_converter.py +++ b/dedoc/converters/concrete_converters/png_converter.py @@ -1,3 +1,4 @@ +import os from typing import Optional import cv2 @@ -25,11 +26,13 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the image-like documents into files with .png extension. """ - if extension in ['.hdr', '.pic', '.sr', '.ras', '.j2k']: - img = cv2.imread(f"{tmp_dir}/{filename}{extension}") - cv2.imwrite(f"{tmp_dir}/{filename}.png", img) + path_in = os.path.join(tmp_dir, f"{filename}{extension}") + path_out = os.path.join(tmp_dir, f"{filename}.png") + if extension in [".hdr", ".pic", ".sr", ".ras", ".j2k"]: + img = cv2.imread(path_in) + cv2.imwrite(path_out, img) else: - img = Image.open(f"{tmp_dir}/{filename}{extension}") - img.save(f"{tmp_dir}/{filename}.png") + img = Image.open(path_in) + img.save(path_out) - return filename + ".png" + return f"{filename}.png" diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py index a9ab630d..312791fe 100644 --- a/dedoc/converters/concrete_converters/pptx_converter.py +++ b/dedoc/converters/concrete_converters/pptx_converter.py @@ -23,9 +23,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the pptx-like documents into files with .pptx extension using the soffice application. """ - path_in = "{tmp_dir}/{filename}{extension}".format(tmp_dir=tmp_dir, extension=extension, filename=filename) + path_in = os.path.join(tmp_dir, f"{filename}{extension}") command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", tmp_dir, path_in] - file_out = filename + '.pptx' + file_out = f"{filename}.pptx" expected_path = os.path.join(tmp_dir, file_out) self._run_subprocess(command=command, filename=filename, expected_path=expected_path) diff --git a/dedoc/converters/file_converter.py b/dedoc/converters/file_converter.py index 43b78899..7048d0ac 100644 --- a/dedoc/converters/file_converter.py +++ b/dedoc/converters/file_converter.py @@ -1,11 +1,9 @@ -import inspect import os -import warnings from stat import S_IREAD, S_IRGRP, S_IROTH from typing import List, Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.utils.utils import splitext_, get_file_mime_type +from dedoc.utils.utils import get_file_mime_type, splitext_ class FileConverterComposition(object): @@ -35,13 +33,7 @@ def do_converting(self, tmp_dir: str, filename: str, parameters: Optional[dict] name, extension = splitext_(filename) mime = get_file_mime_type(os.path.join(tmp_dir, filename)) for converter in self.converters: - if "parameters" in inspect.getfullargspec(converter.can_convert).args: - can_convert = converter.can_convert(extension=extension, mime=mime, parameters=parameters) - else: - warnings.warn("!WARNING! you converter requires an update\n" + - "Please specify parameters argument in method can_convert in {}\n".format(type(converter).__name__) + - " These parameters would be mandatory in the near future") - can_convert = converter.can_convert(extension=extension, mime=mime) + can_convert = converter.can_convert(extension=extension, mime=mime, parameters=parameters) if can_convert: filename = converter.do_convert(tmp_dir, name, extension) break diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py index 4c102e86..11bffc01 100644 --- a/dedoc/data_structures/annotation.py +++ b/dedoc/data_structures/annotation.py @@ -35,10 +35,10 @@ def __eq__(self, o: object) -> bool: return self.name == o.name and self.value == o.value and self.start == o.start and self.end == o.end def __str__(self) -> str: - return "{name}({start}:{end}, {value})".format(name=self.name.capitalize(), start=self.start, end=self.end, value=self.value) + return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})" def __repr__(self) -> str: - return "{name}(...)".format(name=self.name.capitalize()) + return f"{self.name.capitalize()}(...)" def to_dict(self) -> dict: res = OrderedDict() @@ -52,12 +52,12 @@ def to_dict(self) -> dict: def get_api_dict(api: Api) -> Model: names = ["style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table", "attachment", "spacing", "strike", "subscript", "superscript"] - return api.model('Annotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'name': fields.String(description='annotation name', required=True, example='bold', enum=names), - 'value': fields.String(description='annotation value. For example, it may be font size value for size type ' - 'or type of alignment for alignment type', + return api.model("Annotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "name": fields.String(description="annotation name", required=True, example="bold", enum=names), + "value": fields.String(description="annotation value. For example, it may be font size value for size type " + "or type of alignment for alignment type", required=True, example="left") }) diff --git a/dedoc/data_structures/bbox.py b/dedoc/data_structures/bbox.py index 709b54cf..dfdf34b0 100644 --- a/dedoc/data_structures/bbox.py +++ b/dedoc/data_structures/bbox.py @@ -1,5 +1,5 @@ from collections import OrderedDict -from typing import Tuple, Dict +from typing import Dict, Tuple from dedoc.data_structures.serializable import Serializable @@ -46,7 +46,7 @@ def y_bottom_right(self) -> int: return self.y_top_left + self.height def __str__(self) -> str: - return "BBox(x = {} y = {}, w = {}, h = {})".format(self.x_top_left, self.y_top_left, self.width, self.height) + return f"BBox(x = {self.x_top_left} y = {self.y_top_left}, w = {self.width}, h = {self.height})" def __repr__(self) -> str: return self.__str__() @@ -68,10 +68,7 @@ def from_two_points(top_left: Tuple[int, int], bottom_right: Tuple[int, int]) -> """ x_top_left, y_top_left = top_left x_bottom_right, y_bottom_right = bottom_right - return BBox(x_top_left=x_top_left, - y_top_left=y_top_left, - width=x_bottom_right - x_top_left, - height=y_bottom_right - y_top_left) + return BBox(x_top_left=x_top_left, y_top_left=y_top_left, width=x_bottom_right - x_top_left, height=y_bottom_right - y_top_left) def have_intersection_with_box(self, box: "BBox", threshold: float = 0.3) -> bool: """ @@ -81,12 +78,12 @@ def have_intersection_with_box(self, box: "BBox", threshold: float = 0.3) -> boo :param threshold: the lowest value of the intersection over union used get boolean result """ # determine the (x, y)-coordinates of the intersection rectangle - xA = max(self.x_top_left, box.x_top_left) - yA = max(self.y_top_left, box.y_top_left) - xB = min(self.x_top_left + self.width, box.x_top_left + box.width) - yB = min(self.y_top_left + self.height, box.y_top_left + box.height) + x_a = max(self.x_top_left, box.x_top_left) + y_a = max(self.y_top_left, box.y_top_left) + x_b = min(self.x_top_left + self.width, box.x_top_left + box.width) + y_b = min(self.y_top_left + self.height, box.y_top_left + box.height) # compute the area of intersection rectangle - inter_a_area = max(0, xB - xA) * max(0, yB - yA) + inter_a_area = max(0, x_b - x_a) * max(0, y_b - y_a) # compute the area of both the prediction and ground-truth # rectangles box_b_area = float(box.width * box.height) diff --git a/dedoc/data_structures/cell_property.py b/dedoc/data_structures/cell_property.py index 7039e72c..b0186b6c 100644 --- a/dedoc/data_structures/cell_property.py +++ b/dedoc/data_structures/cell_property.py @@ -1,8 +1,8 @@ from collections import OrderedDict from typing import Any -from flask_restx import fields, Api, Model import numpy as np +from flask_restx import Api, Model, fields from dedoc.data_structures.serializable import Serializable @@ -11,7 +11,7 @@ class CellProperty(Serializable): """ This class holds information about the table cell. """ - def __init__(self, cell: Any) -> None: + def __init__(self, cell: Any) -> None: # noqa """ :param cell: class which should contain the following attributes: colspan, rowspan, invisible. """ @@ -28,8 +28,8 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('CellProperty', { - 'colspan': fields.Integer(description='attribute of union column count'), - 'rowspan': fields.Integer(description='attribute of union row count'), - 'invisible': fields.Boolean(description='flag for cell display (for example: if invisible==true then style=\"display: none\")'), + return api.model("CellProperty", { + "colspan": fields.Integer(description="attribute of union column count"), + "rowspan": fields.Integer(description="attribute of union row count"), + "invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'), }) diff --git a/dedoc/data_structures/concrete_annotations/alignment_annotation.py b/dedoc/data_structures/concrete_annotations/alignment_annotation.py index 6f598631..615f8786 100644 --- a/dedoc/data_structures/concrete_annotations/alignment_annotation.py +++ b/dedoc/data_structures/concrete_annotations/alignment_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -22,11 +22,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('AlignmentAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='alignment of the text', - required=True, - example="left", - enum=AlignmentAnnotation.valid_values) + return api.model("AlignmentAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="alignment of the text", required=True, example="left", enum=AlignmentAnnotation.valid_values) }) diff --git a/dedoc/data_structures/concrete_annotations/attach_annotation.py b/dedoc/data_structures/concrete_annotations/attach_annotation.py index c031d949..6b276cbc 100644 --- a/dedoc/data_structures/concrete_annotations/attach_annotation.py +++ b/dedoc/data_structures/concrete_annotations/attach_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import Model, Api, fields +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -21,6 +21,6 @@ def __init__(self, attach_uid: str, start: int, end: int) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('AttachAnnotation', { - 'value': fields.String(description='ref to attachment', required=True, example="attach fafffa145agh") + return api.model("AttachAnnotation", { + "value": fields.String(description="ref to attachment", required=True, example="attach fafffa145agh") }) diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py index a74706cb..fdeb145c 100644 --- a/dedoc/data_structures/concrete_annotations/bbox_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bbox_annotation.py @@ -1,8 +1,8 @@ import json -from dedoc.data_structures.annotation import Annotation -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields +from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.bbox import BBox @@ -26,10 +26,10 @@ def __init__(self, start: int, end: int, value: BBox) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('BBoxAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='bounding box of text chunk', + return api.model("BBoxAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="bounding box of text chunk", required=True, example='{"x_top_left": 0, "y_top_left": 0, "width": 70, "height": 20}') }) diff --git a/dedoc/data_structures/concrete_annotations/bold_annotation.py b/dedoc/data_structures/concrete_annotations/bold_annotation.py index 8531a607..871ab166 100644 --- a/dedoc/data_structures/concrete_annotations/bold_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bold_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,11 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('BoldAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is bold or not', - required=True, - example="True", - enum=BoldAnnotation.valid_values) + return api.model("BoldAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is bold or not", required=True, example="True", enum=BoldAnnotation.valid_values) }) diff --git a/dedoc/data_structures/concrete_annotations/color_annotation.py b/dedoc/data_structures/concrete_annotations/color_annotation.py index 772fc5fb..4b6983d6 100644 --- a/dedoc/data_structures/concrete_annotations/color_annotation.py +++ b/dedoc/data_structures/concrete_annotations/color_annotation.py @@ -33,4 +33,4 @@ def __init__(self, start: int, end: int, red: float, green: float, blue: float) super().__init__(start=start, end=end, name=ColorAnnotation.name, value=json.dumps(value)) def __str__(self) -> str: - return "ColorAnnotation(red={}, green={}, blue={})".format(self.red, self.green, self.blue) + return f"ColorAnnotation(red={self.red}, green={self.green}, blue={self.blue})" diff --git a/dedoc/data_structures/concrete_annotations/confidence_annotation.py b/dedoc/data_structures/concrete_annotations/confidence_annotation.py index af18120f..d7977935 100644 --- a/dedoc/data_structures/concrete_annotations/confidence_annotation.py +++ b/dedoc/data_structures/concrete_annotations/confidence_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -25,8 +25,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('BoldAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='confidence value', required=True, example="95") + return api.model("BoldAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="confidence value", required=True, example="95") }) diff --git a/dedoc/data_structures/concrete_annotations/indentation_annotation.py b/dedoc/data_structures/concrete_annotations/indentation_annotation.py index 1e431af4..4ecbfd16 100644 --- a/dedoc/data_structures/concrete_annotations/indentation_annotation.py +++ b/dedoc/data_structures/concrete_annotations/indentation_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('IndentationAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='text indentation in twentieths of a point (1/1440 of an inch)', - required=True, - example="720") + return api.model("IndentationAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="text indentation in twentieths of a point (1/1440 of an inch)", required=True, example="720") }) diff --git a/dedoc/data_structures/concrete_annotations/italic_annotation.py b/dedoc/data_structures/concrete_annotations/italic_annotation.py index 62ed3a28..0cfc83a1 100644 --- a/dedoc/data_structures/concrete_annotations/italic_annotation.py +++ b/dedoc/data_structures/concrete_annotations/italic_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,11 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('ItalicAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is italic or not', - required=True, - example="True", - enum=ItalicAnnotation.valid_values) + return api.model("ItalicAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is italic or not", required=True, example="True", enum=ItalicAnnotation.valid_values) }) diff --git a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py index fdcecb76..9bd9228e 100644 --- a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py +++ b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -20,9 +20,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('LinkedTextAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='text, linked to given, for example text of the footnote', - required=True,) + return api.model("LinkedTextAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="text, linked to given, for example text of the footnote", required=True) }) diff --git a/dedoc/data_structures/concrete_annotations/size_annotation.py b/dedoc/data_structures/concrete_annotations/size_annotation.py index 6c42e35a..c82c1df0 100644 --- a/dedoc/data_structures/concrete_annotations/size_annotation.py +++ b/dedoc/data_structures/concrete_annotations/size_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SizeAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='the size of the text in points (1/72 of an inch)', - required=True, - example="18.5") + return api.model("SizeAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="the size of the text in points (1/72 of an inch)", required=True, example="18.5") }) diff --git a/dedoc/data_structures/concrete_annotations/spacing_annotation.py b/dedoc/data_structures/concrete_annotations/spacing_annotation.py index 81dace76..ba0c4e1b 100644 --- a/dedoc/data_structures/concrete_annotations/spacing_annotation.py +++ b/dedoc/data_structures/concrete_annotations/spacing_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -19,16 +19,16 @@ def __init__(self, start: int, end: int, value: str) -> None: try: int(value) except ValueError: - raise ValueError("the value of spacing annotation should be a number, get {}".format(value)) + raise ValueError(f"the value of spacing annotation should be a number, get {value}") super().__init__(start=start, end=end, name=SpacingAnnotation.name, value=value) @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SpacingAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='spacing between the current line and the previous one in ' - 'twentieths of a point or one hundredths of a line', + return api.model("SpacingAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="spacing between the current line and the previous one in " + "twentieths of a point or one hundredths of a line", required=True, example="240") }) diff --git a/dedoc/data_structures/concrete_annotations/strike_annotation.py b/dedoc/data_structures/concrete_annotations/strike_annotation.py index 158353ad..25cc9806 100644 --- a/dedoc/data_structures/concrete_annotations/strike_annotation.py +++ b/dedoc/data_structures/concrete_annotations/strike_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('StrikeAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is strikethrough or not', + return api.model("StrikeAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is strikethrough or not", required=True, example="True", enum=StrikeAnnotation.valid_values) diff --git a/dedoc/data_structures/concrete_annotations/style_annotation.py b/dedoc/data_structures/concrete_annotations/style_annotation.py index 2f7fd1c8..234750a3 100644 --- a/dedoc/data_structures/concrete_annotations/style_annotation.py +++ b/dedoc/data_structures/concrete_annotations/style_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -20,10 +20,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('StyleAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='style name', - required=True, - example="heading 1") + return api.model("StyleAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="style name", required=True, example="heading 1") }) diff --git a/dedoc/data_structures/concrete_annotations/subscript_annotation.py b/dedoc/data_structures/concrete_annotations/subscript_annotation.py index 9ca3f2ad..db3edbfe 100644 --- a/dedoc/data_structures/concrete_annotations/subscript_annotation.py +++ b/dedoc/data_structures/concrete_annotations/subscript_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SuperscriptAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is subscript ($a_1$ in tex) or not', + return api.model("SuperscriptAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is subscript ($a_1$ in tex) or not", required=True, example="True", enum=SubscriptAnnotation.valid_values) diff --git a/dedoc/data_structures/concrete_annotations/superscript_annotation.py b/dedoc/data_structures/concrete_annotations/superscript_annotation.py index d2e47dee..98611918 100644 --- a/dedoc/data_structures/concrete_annotations/superscript_annotation.py +++ b/dedoc/data_structures/concrete_annotations/superscript_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SuperscriptAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is superscript ($a^1$ in tex) or not', + return api.model("SuperscriptAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is superscript ($a^1$ in tex) or not", required=True, example="True", enum=SuperscriptAnnotation.valid_values) diff --git a/dedoc/data_structures/concrete_annotations/table_annotation.py b/dedoc/data_structures/concrete_annotations/table_annotation.py index 64217713..8842a84d 100644 --- a/dedoc/data_structures/concrete_annotations/table_annotation.py +++ b/dedoc/data_structures/concrete_annotations/table_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import Model, Api, fields +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -20,6 +20,6 @@ def __init__(self, name: str, start: int, end: int) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('TableAnnotation', { - 'value': fields.String(description='ref to table', required=True, example="table fafffa145agh") + return api.model("TableAnnotation", { + "value": fields.String(description="ref to table", required=True, example="table fafffa145agh") }) diff --git a/dedoc/data_structures/concrete_annotations/underlined_annotation.py b/dedoc/data_structures/concrete_annotations/underlined_annotation.py index b5249a56..e77e397c 100644 --- a/dedoc/data_structures/concrete_annotations/underlined_annotation.py +++ b/dedoc/data_structures/concrete_annotations/underlined_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('UnderlinedAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is underlined or not', + return api.model("UnderlinedAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is underlined or not", required=True, example="True", enum=UnderlinedAnnotation.valid_values) diff --git a/dedoc/data_structures/document_content.py b/dedoc/data_structures/document_content.py index a810d82a..3cf7c1bb 100644 --- a/dedoc/data_structures/document_content.py +++ b/dedoc/data_structures/document_content.py @@ -1,7 +1,7 @@ from collections import OrderedDict from typing import List -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.serializable import Serializable from dedoc.data_structures.table import Table @@ -12,7 +12,7 @@ class DocumentContent(Serializable): """ This class holds the document content - structured text and tables. """ - def __init__(self, tables: List[Table], structure: 'TreeNode', warnings: List[str] = None) -> None: + def __init__(self, tables: List[Table], structure: TreeNode, warnings: List[str] = None) -> None: """ :param tables: list of document tables :param structure: tree structure in which content of the document is organized @@ -30,7 +30,7 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('DocumentContent', { - 'structure': fields.Nested(TreeNode.get_api_dict(api), readonly=True, description='document content structure'), - 'tables': fields.List(fields.Nested(Table.get_api_dict(api), description="tables structure")) + return api.model("DocumentContent", { + "structure": fields.Nested(TreeNode.get_api_dict(api), readonly=True, description="document content structure"), + "tables": fields.List(fields.Nested(Table.get_api_dict(api), description="tables structure")) }) diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index a05777d3..67be8956 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,7 +1,7 @@ import uuid from collections import OrderedDict -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.api.models.custom_fields import wild_any_fields from dedoc.data_structures.serializable import Serializable @@ -43,7 +43,7 @@ def __init__(self, self.other_fields = {} if other_fields is not None and len(other_fields) > 0: self.extend_other_fields(other_fields) - self.uid = "doc_uid_auto_{}".format(uuid.uuid1()) if uid is None else uid + self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid def set_uid(self, uid: str) -> None: self.uid = uid # noqa @@ -79,14 +79,14 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('DocumentMetadata', { - "uid": fields.String(description='unique document identifier', example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"), - 'file_name': fields.String(description='file name', example="example.odt"), - 'temporary_file_name': fields.String(description='file name', example="123.odt"), - 'size': fields.Integer(description='file size in bytes', example="20060"), - 'modified_time': fields.Integer(description='modification time of the document in the format UnixTime', example="1590579805"), - 'created_time': fields.Integer(description='creation time of the document in the format UnixTime', example="1590579805"), - 'access_time': fields.Integer(description='file access time in format UnixTime', example="1590579805"), - 'file_type': fields.String(description='mime-type file', example="application/vnd.oasis.opendocument.text"), - '[a-z]*': wild_any_fields + return api.model("DocumentMetadata", { + "uid": fields.String(description="unique document identifier", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"), + "file_name": fields.String(description="file name", example="example.odt"), + "temporary_file_name": fields.String(description="file name", example="123.odt"), + "size": fields.Integer(description="file size in bytes", example="20060"), + "modified_time": fields.Integer(description="modification time of the document in the format UnixTime", example="1590579805"), + "created_time": fields.Integer(description="creation time of the document in the format UnixTime", example="1590579805"), + "access_time": fields.Integer(description="file access time in format UnixTime", example="1590579805"), + "file_type": fields.String(description="mime-type file", example="application/vnd.oasis.opendocument.text"), + "[a-z]*": wild_any_fields }) diff --git a/dedoc/data_structures/hierarchy_level.py b/dedoc/data_structures/hierarchy_level.py index ab0d2cd7..06b74b50 100644 --- a/dedoc/data_structures/hierarchy_level.py +++ b/dedoc/data_structures/hierarchy_level.py @@ -18,7 +18,7 @@ class HierarchyLevel: toc = "toc" header = "header" toc_item = "toc_item" - list = "list" + list = "list" # noqa list_item = "list_item" bullet_list_item = "bullet_list_item" raw_text = "raw_text" diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py index 05381ba2..ccca5c0a 100644 --- a/dedoc/data_structures/line_metadata.py +++ b/dedoc/data_structures/line_metadata.py @@ -1,7 +1,7 @@ from collections import OrderedDict from typing import Optional -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.api.models.custom_fields import wild_any_fields, wild_forbid_fields from dedoc.data_structures.hierarchy_level import HierarchyLevel @@ -62,12 +62,12 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('LineMetadata', { - 'paragraph_type': fields.String(description="paragraph type (header, list_item, list) and etc.", required=True, example="header"), - 'page_id': fields.Integer(description="page number of begin paragraph", required=False, example=0), - 'line_id': fields.Integer(description="line number of begin paragraph", required=True, example=13), - '_*': wild_forbid_fields, # don't get private fields - 'tag_hierarchy_level': wild_forbid_fields, - 'hierarchy_level': wild_forbid_fields, - '[a-z]*': wild_any_fields + return api.model("LineMetadata", { + "paragraph_type": fields.String(description="paragraph type (header, list_item, list) and etc.", required=True, example="header"), + "page_id": fields.Integer(description="page number of begin paragraph", required=False, example=0), + "line_id": fields.Integer(description="line number of begin paragraph", required=True, example=13), + "_*": wild_forbid_fields, # don't get private fields + "tag_hierarchy_level": wild_forbid_fields, + "hierarchy_level": wild_forbid_fields, + "[a-z]*": wild_any_fields }) diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index c73626dc..74321548 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -1,5 +1,5 @@ import re -from typing import List, Union, Sized +from typing import List, Sized, Union from uuid import uuid1 from dedoc.data_structures.annotation import Annotation diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 81cd1bf2..9483ecb1 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -1,7 +1,7 @@ from collections import OrderedDict from typing import List, Optional -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields import dedoc from dedoc.data_structures.document_content import DocumentContent @@ -43,20 +43,19 @@ def to_dict(self, depth: int = 0) -> dict: res["warnings"] = self.warnings res["content"] = self.content.to_dict() if self.content is not None else [] res["metadata"] = self.metadata.to_dict() - res["attachments"] = [attachment.to_dict(depth=depth + 1) for attachment in self.attachments] \ - if self.attachments is not None and depth < 10 else [] + res["attachments"] = [attachment.to_dict(depth=depth + 1) for attachment in self.attachments] if self.attachments is not None and depth < 10 else [] return res @staticmethod - def get_api_dict(api: Api, depth: int = 0, name: str = 'ParsedDocument') -> Model: + def get_api_dict(api: Api, depth: int = 0, name: str = "ParsedDocument") -> Model: return api.model(name, { - 'content': fields.Nested(DocumentContent.get_api_dict(api), description='Document content structure'), - 'metadata': fields.Nested(DocumentMetadata.get_api_dict(api), allow_null=False, skip_none=True, description='Document meta information'), - 'version': fields.String(description='the version of the program that parsed this document', example="0.9.1"), - 'warnings': fields.List(fields.String(description='list of warnings and possible errors', example="DOCX: seems that document corrupted")), - 'attachments': fields.List(fields.Nested(api.model('others_ParsedDocument', {})), description='structure of attachments', required=False) + "content": fields.Nested(DocumentContent.get_api_dict(api), description="Document content structure"), + "metadata": fields.Nested(DocumentMetadata.get_api_dict(api), allow_null=False, skip_none=True, description="Document meta information"), + "version": fields.String(description="the version of the program that parsed this document", example="0.9.1"), + "warnings": fields.List(fields.String(description="list of warnings and possible errors", example="DOCX: seems that document corrupted")), + "attachments": fields.List(fields.Nested(api.model("others_ParsedDocument", {})), description="structure of attachments", required=False) if depth == 10 # TODO delete this - else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name='refParsedDocument' + str(depth)), - description='Attachment structure', + else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name="refParsedDocument" + str(depth)), + description="Attachment structure", required=False))}) diff --git a/dedoc/data_structures/table.py b/dedoc/data_structures/table.py index 97922ac7..c9d45b68 100644 --- a/dedoc/data_structures/table.py +++ b/dedoc/data_structures/table.py @@ -1,6 +1,7 @@ from collections import OrderedDict -from typing import List, Optional, Any -from flask_restx import fields, Api, Model +from typing import Any, List, Optional + +from flask_restx import Api, Model, fields from dedoc.data_structures.cell_property import CellProperty from dedoc.data_structures.serializable import Serializable @@ -31,7 +32,7 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('Table', { - 'cells': fields.List(fields.List(fields.String(description="Cell contains text")), description="matrix of cells"), - 'metadata': fields.Nested(TableMetadata.get_api_dict(api), readonly=True, description='Table meta information') + return api.model("Table", { + "cells": fields.List(fields.List(fields.String(description="Cell contains text")), description="matrix of cells"), + "metadata": fields.Nested(TableMetadata.get_api_dict(api), readonly=True, description="Table meta information") }) diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index 86d3f63a..be5c3fe0 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -1,9 +1,10 @@ from collections import OrderedDict -from typing import Optional, List -from flask_restx import fields, Api, Model +from typing import List, Optional + +from flask_restx import Api, Model, fields -from dedoc.data_structures.serializable import Serializable from dedoc.data_structures.cell_property import CellProperty +from dedoc.data_structures.serializable import Serializable class TableMetadata(Serializable): @@ -28,17 +29,16 @@ def to_dict(self) -> dict: res["uid"] = self.uid res["page_id"] = self.page_id res["is_inserted"] = self.is_inserted - res["cell_properties"] = [[cell_prop.to_dict() for cell_prop in row_prop] - for row_prop in self.cell_properties] if self.cell_properties else None + res["cell_properties"] = [[cell_prop.to_dict() for cell_prop in row_prop] for row_prop in self.cell_properties] if self.cell_properties else None return res @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('TableMetadata', { - 'page_id': fields.Integer(readonly=False, description='table start page number'), - 'uid': fields.String(description="table unique id"), - 'is_inserted': fields.Boolean(description="was the table inserted into document body"), - 'cell_properties': fields.List(fields.List(fields.Nested(CellProperty.get_api_dict(api), + return api.model("TableMetadata", { + "page_id": fields.Integer(readonly=False, description="table start page number"), + "uid": fields.String(description="table unique id"), + "is_inserted": fields.Boolean(description="was the table inserted into document body"), + "cell_properties": fields.List(fields.List(fields.Nested(CellProperty.get_api_dict(api), description="cell properties, colspan, rowspan, etc", allow_null=True, skip_none=True))) diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 3884d8fa..454e5059 100644 --- a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -1,14 +1,14 @@ from collections import OrderedDict from typing import List, Optional -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation -from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.serializable import Serializable from dedoc.utils.annotation_merger import AnnotationMerger -from dedoc.data_structures.hierarchy_level import HierarchyLevel class TreeNode(Serializable): @@ -48,24 +48,21 @@ def to_dict(self) -> dict: return res @staticmethod - def get_api_dict(api: Api, depth: int = 0, name: str = 'TreeNode') -> Model: + def get_api_dict(api: Api, depth: int = 0, name: str = "TreeNode") -> Model: return api.model(name, { - 'node_id': fields.String(description="Document element identifier. It is unique within one tree (i.e. " + "node_id": fields.String(description="Document element identifier. It is unique within one tree (i.e. " "there will be no other such node_id in this tree, but in attachment " "it may occur) The identifier has the form 0.2.1 where each number " "means a serial number at the corresponding level of the hierarchy.", required=True, - example="0.2.1" - ), - 'text': fields.String(description="text of node", required=True, example="Закон"), - 'annotations': fields.List(fields.Nested(Annotation.get_api_dict(api), - description="Text annotations (font, size, bold, italic and etc)")), - 'metadata': fields.Nested(LineMetadata.get_api_dict(api), skip_none=True, allow_null=False, description="Line meta information"), - 'subparagraphs': fields.List(fields.Nested(api.model('others_TreeNode', {})), - description="Node childes (with type 'TreeNode') of structure tree") + example="0.2.1"), + "text": fields.String(description="text of node", required=True, example="Закон"), + "annotations": fields.List(fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic and etc)")), + "metadata": fields.Nested(LineMetadata.get_api_dict(api), skip_none=True, allow_null=False, description="Line meta information"), + "subparagraphs": fields.List(fields.Nested(api.model("others_TreeNode", {})), description='Node childes (with type "TreeNode") of structure tree') if depth == 30 # TODO delete this - else fields.List(fields.Nested(TreeNode.get_api_dict(api, depth=depth + 1, name='refTreeNode' + str(depth))), - description="Node childes (with type 'TreeNode') of structure tree") + else fields.List(fields.Nested(TreeNode.get_api_dict(api, depth=depth + 1, name="refTreeNode" + str(depth))), + description='Node childes (with type "TreeNode") of structure tree') }) @staticmethod @@ -101,7 +98,7 @@ def add_child(self, line: LineWithMeta) -> "TreeNode": :return: return created node (child of the self) """ new_node = TreeNode( - node_id=self.node_id + ".{}".format(len(self.subparagraphs)), + node_id=f"{self.node_id}.{len(self.subparagraphs)}", text=line.line, annotations=line.annotations, metadata=line.metadata, diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py index 62d7faa7..0079d5c6 100644 --- a/dedoc/dedoc_manager.py +++ b/dedoc/dedoc_manager.py @@ -2,14 +2,14 @@ import os.path import shutil import tempfile -from typing import Optional, Dict +from typing import Dict, Optional -from dedoc.common.exceptions.dedoc_exception import DedocException +from dedoc.common.exceptions.dedoc_error import DedocError from dedoc.config import get_config -from dedoc.manager_config import get_manager_config from dedoc.data_structures import ParsedDocument, UnstructuredDocument +from dedoc.manager_config import get_manager_config from dedoc.metadata_extractors import BaseMetadataExtractor -from dedoc.train_dataset.train_dataset_utils import save_line_with_meta, get_path_original_documents +from dedoc.train_dataset.train_dataset_utils import get_path_original_documents, save_line_with_meta from dedoc.utils.utils import get_unique_name @@ -67,7 +67,7 @@ def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> try: return self.__parse_no_error_handling(file_path=file_path, parameters=parameters) - except DedocException as e: + except DedocError as e: file_dir, file_name = os.path.split(file_path) e.filename = file_name e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, filename=file_name, name_actual=file_name) diff --git a/dedoc/download_models.py b/dedoc/download_models.py index dd511463..9af9abd9 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -6,7 +6,6 @@ from dedoc.config import get_config - """ These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc. Keys are the names of repositories with models. diff --git a/dedoc/extensions.py b/dedoc/extensions.py index a21fd585..3e8d326a 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -3,38 +3,38 @@ from dedoc.utils.utils import get_extensions_by_mimes -Extensions = namedtuple('Parts', 'excel_like_format ' - 'pptx_like_format ' - 'csv_like_format ' - 'docx_like_format ' - 'archive_like_format ' - 'image_like_format ' - 'pdf_like_format ' - 'txt_like_format') +Extensions = namedtuple("Parts", "excel_like_format " + "pptx_like_format " + "csv_like_format " + "docx_like_format " + "archive_like_format " + "image_like_format " + "pdf_like_format " + "txt_like_format") converted_extensions = Extensions( - excel_like_format=['.ods', 'xls'], - docx_like_format=['.odt', '.doc'], - pptx_like_format=['.odp', '.ppt'], + excel_like_format=[".ods", "xls"], + docx_like_format=[".odt", ".doc"], + pptx_like_format=[".odp", ".ppt"], archive_like_format=[], - image_like_format=['.pcx', '.webp', '.sgi', '.hdr', '.sr', '.pic', '.dib', '.jfif', '.j2k'], + image_like_format=[".pcx", ".webp", ".sgi", ".hdr", ".sr", ".pic", ".dib", ".jfif", ".j2k"], pdf_like_format=[], csv_like_format=[], - txt_like_format=['.xml'] + txt_like_format=[".xml"] ) # .sgi, .hdr, .sr, .ras - не зарегистрованы в mime converted_mimes = Extensions( excel_like_format=["application/vnd.oasis.opendocument.spreadsheet", "application/vnd.ms-excel"], - docx_like_format=['application/msword', "application/vnd.oasis.opendocument.text"], - pptx_like_format=['application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'application/vnd.ms-powerpoint', 'application/vnd.oasis.opendocument.presentation'], + docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text"], + pptx_like_format=["application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-powerpoint", "application/vnd.oasis.opendocument.presentation"], archive_like_format=[], - image_like_format=['image/gif', - 'image/x-portable-pixmap', 'image/x-portable-anymap', 'image/x-portable-graymap', - 'image/x-portable-bitmap', 'image/x-pcx', 'image/x-pict', - 'application/postscript', 'image/x-cmu-raster'], - pdf_like_format=['image/vnd.djvu'], + image_like_format=["image/gif", + "image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap", + "image/x-portable-bitmap", "image/x-pcx", "image/x-pict", + "application/postscript", "image/x-cmu-raster"], + pdf_like_format=["image/vnd.djvu"], csv_like_format=[], txt_like_format=["application/xml", "text/xml"] ) @@ -51,14 +51,14 @@ ) recognized_mimes = Extensions( - excel_like_format=['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.ms-excel'], - docx_like_format=['application/vnd.openxmlformats-officedocument.wordprocessingml.document'], - pptx_like_format=['application/vnd.openxmlformats-officedocument.presentationml.presentation'], - archive_like_format=['application/zip', 'application/x-tar', 'application/x-rar-compressed', 'application/rar', 'application/x-7z-compressed'], - image_like_format=['image/jpeg', 'image/png', 'image/tiff', 'image/x-ms-bmp', 'image/bmp'], - pdf_like_format=['application/pdf'], + excel_like_format=["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"], + docx_like_format=["application/vnd.openxmlformats-officedocument.wordprocessingml.document"], + pptx_like_format=["application/vnd.openxmlformats-officedocument.presentationml.presentation"], + archive_like_format=["application/zip", "application/x-tar", "application/x-rar-compressed", "application/rar", "application/x-7z-compressed"], + image_like_format=["image/jpeg", "image/png", "image/tiff", "image/x-ms-bmp", "image/bmp"], + pdf_like_format=["application/pdf"], csv_like_format=[], - txt_like_format=['text/plain', 'text/html'] + txt_like_format=["text/plain", "text/html"] ) diff --git a/dedoc/main.py b/dedoc/main.py index b784fad0..e80ef2e4 100644 --- a/dedoc/main.py +++ b/dedoc/main.py @@ -1,7 +1,7 @@ import argparse +from dedoc.api.dedoc_api import get_api, run_api # noqa from dedoc.config import Configuration, get_config -from dedoc.api.dedoc_api import run_api, get_api # noqa def main() -> None: @@ -12,11 +12,11 @@ def main() -> None: parser_config = argparse.ArgumentParser() parser_config.add_argument("-c", "--config_path", help="path to configuration file") parser_config.add_argument("-m", "--module", help="Only for tests") - parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs='*', help="Only for tests") - parser_config.add_argument('-v', "--unitest_verbose_mode", nargs='?', help="to enable verbose mode of unittest. Only for tests") + parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs="*", help="Only for tests") + parser_config.add_argument("-v", "--unitest_verbose_mode", nargs="?", help="to enable verbose mode of unittest. Only for tests") args_config = parser_config.parse_args() - Configuration.getInstance().getConfig(args_config) + Configuration.get_instance().get_config(args_config) config = get_config() if config.get("labeling_mode", False): diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py index f2cd01c7..b7993f53 100644 --- a/dedoc/manager_config.py +++ b/dedoc/manager_config.py @@ -1,3 +1,5 @@ +from typing import Optional + from dedoc.attachments_handler.attachments_handler import AttachmentsHandler from dedoc.converters.concrete_converters.binary_converter import BinaryConverter from dedoc.converters.concrete_converters.docx_converter import DocxConverter @@ -114,7 +116,7 @@ class ConfigurationManager(object): __config = None @classmethod - def getInstance(cls: "ConfigurationManager") -> "ConfigurationManager": + def get_instance(cls: "ConfigurationManager") -> "ConfigurationManager": """ Actual object creation will happen when we use ConfigurationManager.getInstance() """ @@ -123,17 +125,17 @@ def getInstance(cls: "ConfigurationManager") -> "ConfigurationManager": return cls.__instance - def initConfig(self, config: dict, new_config: dict = None) -> None: + def init_config(self, config: dict, new_config: Optional[dict] = None) -> None: if new_config is None: self.__config = _get_manager_config(config) else: self.__config = new_config - def getConfig(self, config: dict) -> dict: + def get_config(self, config: dict) -> dict: if self.__config is None: - self.initConfig(config) + self.init_config(config) return self.__config def get_manager_config(config: dict) -> dict: - return ConfigurationManager().getInstance().getConfig(config) + return ConfigurationManager().get_instance().get_config(config) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index b1e2399e..5d37ad61 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -4,7 +4,7 @@ from typing import Optional, Union import piexif -from PIL import Image, ExifTags +from PIL import ExifTags, Image from dateutil import parser from dedoc.data_structures.unstructured_document import UnstructuredDocument diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index 14839416..2708e5e6 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -2,7 +2,7 @@ import pickle from typing import Optional -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -46,20 +46,20 @@ def add_metadata(self, try: file_path = os.path.join(directory, filename) - with open(file_path, 'rb') as infile: + with open(file_path, "rb") as infile: note_dict = pickle.load(infile) - fields = {"author": note_dict['author']} + fields = {"author": note_dict["author"]} other_fields = {**other_fields, **fields} if other_fields is not None else fields meta_info = dict(file_name=original_filename, file_type="note", - size=note_dict['size'], - access_time=note_dict['modified_time'], - created_time=note_dict['created_time'], - modified_time=note_dict['modified_time'], + size=note_dict["size"], + access_time=note_dict["modified_time"], + created_time=note_dict["created_time"], + modified_time=note_dict["modified_time"], other_fields=other_fields) document.metadata = meta_info return document except Exception: - raise BadFileFormatException(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") + raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index 74660e9e..96682fc0 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -90,7 +90,7 @@ def _get_pdf_info(self, path: str) -> dict: except PdfReadError: return {"broken_pdf": True} except Exception as e: - self.logger.warning("exception while extract pdf metadata: {} {}".format(path, e)) + self.logger.warning(f"exception while extract pdf metadata: {path} {e}") if self.config.get("debug_mode", False): raise e return {"broken_pdf": True} diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py index e493aff2..80b73adc 100644 --- a/dedoc/readers/archive_reader/archive_reader.py +++ b/dedoc/readers/archive_reader/archive_reader.py @@ -4,12 +4,12 @@ import uuid import zipfile import zlib -from typing import List, Optional, IO, Iterator +from typing import IO, Iterator, List, Optional import py7zlib import rarfile -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes @@ -47,30 +47,30 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio def __get_attachments(self, path: str) -> List[AttachedFile]: tmp_dir = os.path.dirname(path) mime = get_file_mime_type(path) - if zipfile.is_zipfile(path) and mime == 'application/zip': + if zipfile.is_zipfile(path) and mime == "application/zip": return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir)) if tarfile.is_tarfile(path): return list(self.__read_tar_archive(path=path, tmp_dir=tmp_dir)) if rarfile.is_rarfile(path): return list(self.__read_rar_archive(path=path, tmp_dir=tmp_dir)) - if mime == 'application/x-7z-compressed': + if mime == "application/x-7z-compressed": return list(self.__read_7z_archive(path=path, tmp_dir=tmp_dir)) # if no one can handle this archive raise exception - raise BadFileFormatException("bad archive {}".format(path)) + raise BadFileFormatError(f"bad archive {path}") def __read_zip_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: try: - with zipfile.ZipFile(path, 'r') as arch_file: + with zipfile.ZipFile(path, "r") as arch_file: names = [member.filename for member in arch_file.infolist() if member.file_size > 0] for name in names: with arch_file.open(name) as file: yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file) except (zipfile.BadZipFile, zlib.error) as e: - self.logger.warning("Can't read file {} ({})".format(path, e)) - raise BadFileFormatException("Can't read file {} ({})".format(path, e)) + self.logger.warning(f"Can't read file {path} ({e})") + raise BadFileFormatError(f"Can't read file {path} ({e})") def __read_tar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: - with tarfile.open(path, 'r') as arch_file: + with tarfile.open(path, "r") as arch_file: names = [member.name for member in arch_file.getmembers() if member.isfile()] for name in names: file = arch_file.extractfile(name) @@ -78,7 +78,7 @@ def __read_tar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: file.close() def __read_rar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: - with rarfile.RarFile(path, 'r') as arch_file: + with rarfile.RarFile(path, "r") as arch_file: names = [item.filename for item in arch_file.infolist() if item.compress_size > 0] for name in names: with arch_file.open(name) as file: @@ -102,6 +102,6 @@ def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes]) -> original_name=file_name, tmp_file_path=os.path.join(tmp_dir, tmp_path), need_content_analysis=True, - uid="attach_{}".format(uuid.uuid1()) + uid=f"attach_{uuid.uuid1()}" ) return attachment diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py index 872b9e23..9b639506 100644 --- a/dedoc/readers/csv_reader/csv_reader.py +++ b/dedoc/readers/csv_reader/csv_reader.py @@ -1,5 +1,5 @@ import csv -from typing import Optional, Tuple, List +from typing import List, Optional, Tuple from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata @@ -38,7 +38,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio data = list(csv_reader) table_metadata = TableMetadata(page_id=0) tables = [Table(cells=data, metadata=table_metadata)] - warnings = ["delimiter is '{}'".format(delimiter)] + warnings = [f"delimiter is '{delimiter}'"] warnings.extend(encoding_warning) return UnstructuredDocument(lines=[], tables=tables, attachments=[], warnings=warnings) diff --git a/dedoc/readers/docx_reader/data_structures/base_props.py b/dedoc/readers/docx_reader/data_structures/base_props.py index 841e183a..c439c3d0 100644 --- a/dedoc/readers/docx_reader/data_structures/base_props.py +++ b/dedoc/readers/docx_reader/data_structures/base_props.py @@ -8,7 +8,7 @@ def __init__(self, properties: Optional["BaseProperties"] = None) -> None: # no Contains style properties for paragraphs and runs. :param properties: Paragraph or Run for copying its properties """ - self.jc = properties.jc if properties else 'left' + self.jc = properties.jc if properties else "left" self.indentation = properties.indentation if properties and properties.indentation else 0 self.size = properties.size if properties else 0 self.bold = properties.bold if properties else False diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py index 901f4750..41190ede 100644 --- a/dedoc/readers/docx_reader/data_structures/docx_document.py +++ b/dedoc/readers/docx_reader/data_structures/docx_document.py @@ -4,20 +4,20 @@ import re import zipfile from collections import defaultdict -from typing import Optional, List +from typing import List, Optional from bs4 import BeautifulSoup, Tag -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph from dedoc.readers.docx_reader.data_structures.table import DocxTable from dedoc.readers.docx_reader.data_structures.utils import Counter from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor +from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.docx_reader.styles_extractor import StylesExtractor from dedoc.utils.utils import calculate_file_hash @@ -30,15 +30,15 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L self.path_hash = calculate_file_hash(path=path) self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} - self.document_bs_tree = self.__get_bs_tree('word/document.xml') + self.document_bs_tree = self.__get_bs_tree("word/document.xml") if self.document_bs_tree is None: - self.document_bs_tree = self.__get_bs_tree('word/document2.xml') + self.document_bs_tree = self.__get_bs_tree("word/document2.xml") self.body = self.document_bs_tree.body if self.document_bs_tree else None - self.footnote_extractor = FootnoteExtractor(self.__get_bs_tree('word/footnotes.xml')) - self.endnote_extractor = FootnoteExtractor(self.__get_bs_tree('word/endnotes.xml'), key="endnote") - self.styles_extractor = StylesExtractor(self.__get_bs_tree('word/styles.xml'), logger) - num_tree = self.__get_bs_tree('word/numbering.xml') + self.footnote_extractor = FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")) + self.endnote_extractor = FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote") + self.styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), logger) + num_tree = self.__get_bs_tree("word/numbering.xml") self.numbering_extractor = NumberingExtractor(num_tree, self.styles_extractor) if num_tree else None self.styles_extractor.numbering_extractor = self.numbering_extractor @@ -65,7 +65,7 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]: if not isinstance(paragraph_xml, Tag): continue - if paragraph_xml.name == 'tbl': + if paragraph_xml.name == "tbl": self.__handle_table_xml(paragraph_xml, table_refs, uids_set, cnt) continue @@ -73,14 +73,14 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]: self.__handle_diagram_xml(paragraph_xml, diagram_refs, uids_set, cnt) continue - if paragraph_xml.name != 'p': - for subparagraph_xml in paragraph_xml.find_all('w:p'): # TODO check what to add + if paragraph_xml.name != "p": + for subparagraph_xml in paragraph_xml.find_all("w:p"): # TODO check what to add paragraph = self.__xml2paragraph(subparagraph_xml, uids_set, cnt) self.paragraph_list.append(paragraph) continue self.paragraph_list.append(self.__xml2paragraph(paragraph_xml, uids_set, cnt)) - images = paragraph_xml.find_all('pic:pic') + images = paragraph_xml.find_all("pic:pic") if images: self.__handle_images_xml(images, image_refs, uids_set, cnt) @@ -124,12 +124,12 @@ def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]: with zipfile.ZipFile(self.path) as document: content = document.read(filename) content = re.sub(br"\n[\t ]*", b"", content) - soup = BeautifulSoup(content, 'xml') + soup = BeautifulSoup(content, "xml") return soup except KeyError: return None except zipfile.BadZipFile: - raise BadFileFormatException("Bad docx file:\n file_name = {}. Seems docx is broken".format(os.path.basename(self.path))) + raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken") def __xml2paragraph(self, paragraph_xml: Tag, uids_set: set, cnt: Counter) -> Paragraph: uid = self.__get_paragraph_uid(paragraph_xml=paragraph_xml, uids_set=uids_set) @@ -146,12 +146,12 @@ def __xml2paragraph(self, paragraph_xml: Tag, uids_set: set, cnt: Counter) -> Pa def __get_paragraph_uid(self, paragraph_xml: Tag, uids_set: set) -> str: xml_hash = hashlib.md5(paragraph_xml.encode()).hexdigest() - raw_uid = '{}_{}'.format(self.path_hash, xml_hash) + raw_uid = f"{self.path_hash}_{xml_hash}" uid = raw_uid n = 0 while uid in uids_set: n += 1 - uid = raw_uid + "_{}".format(n) + uid = f"{raw_uid}_{n}" uids_set.add(uid) return uid @@ -168,13 +168,13 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict, uids_set: set, cnt: Cou table_refs[len(self.paragraph_list) - 1].append(table_uid) def __handle_images_xml(self, xmls: List[Tag], image_refs: dict, uids_set: set, cnt: Counter) -> None: - rels = self.__get_bs_tree('word/_rels/document.xml.rels') + rels = self.__get_bs_tree("word/_rels/document.xml.rels") if rels is None: - rels = self.__get_bs_tree('word/_rels/document2.xml.rels') + rels = self.__get_bs_tree("word/_rels/document2.xml.rels") images_rels = dict() - for rel in rels.find_all('Relationship'): - if rel["Target"].startswith('media/'): + for rel in rels.find_all("Relationship"): + if rel["Target"].startswith("media/"): images_rels[rel["Id"]] = rel["Target"][6:] self.__prepare_paragraph_list(uids_set, cnt) @@ -208,5 +208,5 @@ def __prepare_paragraph_list(self, uids_set: set, cnt: Counter) -> None: break if not self.paragraph_list: - empty_paragraph = self.__xml2paragraph(BeautifulSoup('page_id {}
line_id {}
text {}
".format( - page_id, line_id, text), + labeled=[line["_metadata"]["hierarchy_level"]["line_type"]], + additional_info=f"page_id {page_id}
line_id {line_id}
text {text}
", default_label=self.item2label(line) ) task_items.append(task_item) diff --git a/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py index df11a4cc..cbad1231 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py @@ -10,8 +10,8 @@ from dedoc.train_dataset.data_path_config import table_path from dedoc.train_dataset.data_structures.task_item import TaskItem from dedoc.train_dataset.taskers.concrete_taskers.abstract_tasker import AbstractTasker -from dedoc.utils.utils import get_batch from dedoc.utils.image_utils import draw_rectangle +from dedoc.utils.utils import get_batch class File: @@ -37,15 +37,13 @@ def create_tasks(self, task_size: int, tasks_uid: str) -> Iterable[str]: files = self._get_files() with tempfile.TemporaryDirectory() as tmp_dir: for i, batch in enumerate(get_batch(task_size, files)): - task_directory = "task_{:03d}".format(i) - archive_path = "/tmp/{}.zip".format(task_directory) - image_directory = "{}/images".format(task_directory) + task_directory = f"task_{i:03d}" + archive_path = f"/tmp/{task_directory}.zip" + image_directory = f"{task_directory}/images" with ZipFile(archive_path, "a") as task_archive: self.__add_task(archive=task_archive, files=batch, task_directory=task_directory) dockerfile_directory = os.path.join(self.resources_path, "train_dataset/img_classifier_dockerfile") - self._add_docker_files(archive=task_archive, - task_directory=task_directory, - dockerfile_directory=dockerfile_directory) + self._add_docker_files(archive=task_archive, task_directory=task_directory, dockerfile_directory=dockerfile_directory) self._add_config(task_archive=task_archive, task_name=task_directory, task_directory=task_directory, @@ -58,15 +56,14 @@ def __add_task(self, archive: ZipFile, files: List[File], task_directory: str) - task_items = {} for task_id, file in enumerate(files): data = file.data - data["original_document"] = "{}.png".format(file.name) + data["original_document"] = f"{file.name}.png" task_items[task_id] = TaskItem(task_id=task_id, - task_path="images/{}".format(os.path.basename(file.image_path)), + task_path=f"images/{os.path.basename(file.image_path)}", labeled=None, data=data, additional_info="", default_label="table").to_dict() - archive.writestr("{}/tasks.json".format(task_directory), - json.dumps(task_items, ensure_ascii=False, indent=4).encode("utf-8")) + archive.writestr(f"{task_directory}/tasks.json", json.dumps(task_items, ensure_ascii=False, indent=4).encode("utf-8")) def get_original_documents(self) -> str: archive_path = "/tmp/original_documents.zip" @@ -77,8 +74,8 @@ def _get_files(self) -> List[File]: files = {file.split(".")[0] for file in os.listdir(table_path)} result = [] for file_name in sorted(files): - image_path = os.path.join(table_path, "{}.png".format(file_name)) - json_path = os.path.join(table_path, "{}.json".format(file_name)) + image_path = os.path.join(table_path, f"{file_name}.png") + json_path = os.path.join(table_path, f"{file_name}.json") file = File(image_path=image_path, json_path=json_path) result.append(file) return result @@ -104,6 +101,6 @@ def __add_images(self, files: List[File], archive: ZipFile, image_directory: str height=bbox.height, color=(255, 0, 0)) image_rectangle = Image.fromarray(image_rectangle) - image_path = os.path.join(tmpdir, "{}.png".format(file.name)) + image_path = os.path.join(tmpdir, f"{file.name}.png") image_rectangle.save(image_path) - archive.write(image_path, "{}/{}.png".format(image_directory, file.name)) + archive.write(image_path, f"{image_directory}/{file.name}.png") diff --git a/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py b/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py index 0ba8c788..f6ec7b54 100644 --- a/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py +++ b/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py @@ -8,7 +8,7 @@ from collections import defaultdict from collections import namedtuple from copy import deepcopy -from typing import Iterator, Optional, Dict, Iterable, Tuple +from typing import Dict, Iterable, Iterator, Optional, Tuple from typing import List import numpy as np @@ -17,7 +17,7 @@ from bs4 import BeautifulSoup from pdf2image import convert_from_path -from dedoc.common.exceptions.conversion_exception import ConversionException +from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader @@ -36,8 +36,8 @@ def __init__(self, path2docs: str, *, config: dict) -> None: self.first_color = 15 self.base_color = 0 self.base_color_step = 1 - self.many_colors_file_name = 'many_colors_doc' - self.two_colors_file_name = 'two_colors_doc' + self.many_colors_file_name = "many_colors_doc" + self.two_colors_file_name = "two_colors_doc" self.config = config self.logger = self.config.get("logger", logging.getLogger()) @@ -50,7 +50,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None: nonzero pixels on bboxes only) 3 we clear bboxes from first image 4 and create one image per bbox and save in tmp dir - 5 and finally we return image with bboxes in the proper order + 5 finally we return image with bboxes in the proper order @param page: @param archive: @return: @@ -65,7 +65,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None: self.logger.info("\nstart image processing") uid_with_images = self._create_images_from_pdf(pdfs=pdfs, page=page, tmp_dir=tmp_dir) for uid, image in uid_with_images: - img_name = "{}.jpg".format(uid) + img_name = f"{uid}.jpg" with tempfile.TemporaryDirectory() as tmpdir: img_path = os.path.join(tmpdir, img_name) image.save(img_path, format="jpeg") @@ -91,7 +91,7 @@ def __create_pair_pdfs(self, docx_archive: zipfile.ZipFile, document: DocxDocume text = re.sub("w:ppr", "w:pPr", text) many_colors_pdf = self.__create_pdf_from_docx(tmp_dir, self.many_colors_file_name, namelist, text) # clear document_bs from border tags - border_tags = document_bs.find_all('w:pbdr') + border_tags = document_bs.find_all("w:pbdr") for tag in border_tags: tag.decompose() # create docx file with bboxes of two interleaving colors @@ -103,9 +103,7 @@ def __create_pair_pdfs(self, docx_archive: zipfile.ZipFile, document: DocxDocume self.logger.info("\nstart image converting") return PairedPdf(many_colors_pdf, two_colors_pdf, used_many_colors, used_two_colors) - def __draw_bboxes(self, - paragraph_list: List[Paragraph], - many_colors: bool) -> Dict[str, int]: + def __draw_bboxes(self, paragraph_list: List[Paragraph], many_colors: bool) -> Dict[str, int]: """ draw bbox in docx document around each paragraph @param paragraph_list: @@ -137,20 +135,17 @@ def __draw_bboxes(self, def _color_from_decimal(decimal_color: int) -> str: color = hex(decimal_color)[2:] if len(color) < 6: - color = '0' * (6 - len(color)) + color + color = "0" * (6 - len(color)) + color return color @staticmethod - def __create_pdf_from_docx(tmp_dir: str, - doc_name: str, - namelist: List[str], - doc_text: str) -> str: - with open('{}/word/document.xml'.format(tmp_dir), 'w') as f: + def __create_pdf_from_docx(tmp_dir: str, doc_name: str, namelist: List[str], doc_text: str) -> str: + with open(f"{tmp_dir}/word/document.xml", "w") as f: f.write(doc_text) - docx_path = "{}/{}.docx".format(tmp_dir, doc_name) - with zipfile.ZipFile(docx_path, mode='w') as new_d: + docx_path = f"{tmp_dir}/{doc_name}.docx" + with zipfile.ZipFile(docx_path, mode="w") as new_d: for filename in namelist: - new_d.write('{}/{}'.format(tmp_dir, filename), arcname=filename) + new_d.write(f"{tmp_dir}/{filename}", arcname=filename) # create pdf file with bbox pdf_name = DocxImagesCreator.__docx2pdf(tmp_dir, docx_path) os.remove(docx_path) @@ -166,36 +161,28 @@ def __await_for_conversion(filename: str) -> None: t += period_checking if t >= timeout: - raise ConversionException( - msg="fail with {filename}".format(filename=filename), - msg_api="Unsupported file format {}".format(filename)) + raise ConversionError(msg=f"fail with {filename}", msg_api=f"Unsupported file format {filename}") @staticmethod - def __docx2pdf(out_dir: str, - path: str) -> str: - os.system("soffice --headless --convert-to pdf {} --outdir {}".format(path, out_dir)) - out_file = '{}/{}pdf'.format(out_dir, os.path.split(path)[-1][:-4]) + def __docx2pdf(out_dir: str, path: str) -> str: + os.system(f"soffice --headless --convert-to pdf {path} --outdir {out_dir}") + out_file = f"{out_dir}/{os.path.split(path)[-1][:-4]}pdf" DocxImagesCreator.__await_for_conversion(out_file) return out_file @staticmethod - def __insert_border(bs_tree: Optional[BeautifulSoup], - color: str) -> None: + def __insert_border(bs_tree: Optional[BeautifulSoup], color: str) -> None: if bs_tree is None: return - border_str = 'id = 0 ; type = root
Пример документа", result)
self.assertTrue("\n"
"\n"
- " " in result)
def test_newline_tree(self) -> None:
@@ -136,7 +135,7 @@ def test_docx_heading_new(self) -> None:
def __check_doc_like(self, result: dict) -> None:
content = result["content"]["structure"]
self.assertEqual("", get_by_tree_path(content, "0")["text"])
- self.assertEqual('Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения',
+ self.assertEqual("Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения",
get_by_tree_path(content, "0.0")["text"].strip())
self.assertEqual("1.2.1. Поясним за непонятное", get_by_tree_path(content, "0.1.0")["text"].strip())
self.assertEqual("1.2.2. Поясним за понятное", get_by_tree_path(content, "0.1.1")["text"].strip())
@@ -146,8 +145,7 @@ def __check_doc_like(self, result: dict) -> None:
table1, table2 = result["content"]["tables"]
- self.assertListEqual(["N", "Фамилия", "Имя", "Организация", "Телефон", "Примечания"],
- table1["cells"][0])
+ self.assertListEqual(["N", "Фамилия", "Имя", "Организация", "Телефон", "Примечания"], table1["cells"][0])
self.assertListEqual(["1", "Иванов", "Иван", "ИСП", "8-800", ""], table1["cells"][1])
self.assertListEqual(["Фамилия", "Имя", "Отчество"], table2["cells"][0])
@@ -165,7 +163,7 @@ def __check_doc_like(self, result: dict) -> None:
def __check_doc_like_insert_table(self, result: dict) -> None:
content = result["content"]["structure"]
self.assertEqual("", get_by_tree_path(content, "0")["text"])
- self.assertEqual('Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения',
+ self.assertEqual("Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения",
get_by_tree_path(content, "0.0")["text"].strip())
self.assertEqual("1.2.1. Поясним за непонятное", get_by_tree_path(content, "0.1.0")["text"].strip())
self.assertEqual("1.2.2. Поясним за понятное", get_by_tree_path(content, "0.1.1")["text"].strip())
diff --git a/tests/api_tests/test_api_format_docx_annotations.py b/tests/api_tests/test_api_format_docx_annotations.py
index 9969563f..65df827d 100644
--- a/tests/api_tests/test_api_format_docx_annotations.py
+++ b/tests/api_tests/test_api_format_docx_annotations.py
@@ -8,104 +8,104 @@ class TestApiDocxAnnotations(AbstractTestApiDocReader):
def test_example_1(self) -> None:
result = self._send_request("example_1.docx", data={"structure_type": "linear"})
- subparagraphs = result['content']['structure']['subparagraphs']
- annotations = [subparagraph['annotations'] for subparagraph in subparagraphs]
+ subparagraphs = result["content"]["structure"]["subparagraphs"]
+ annotations = [subparagraph["annotations"] for subparagraph in subparagraphs]
# bold, italic, underlined
- self.assertIn({'start': 0, 'end': 11, 'name': 'style', 'value': 'Body'}, annotations[0])
- self.assertIn({'start': 0, 'end': 12, 'name': 'italic', 'value': 'True'}, annotations[1])
- self.assertIn({'start': 0, 'end': 10, 'name': 'bold', 'value': 'True'}, annotations[2])
- self.assertIn({'start': 0, 'end': 16, 'name': 'underlined', 'value': 'True'}, annotations[3])
- self.assertIn({'start': 0, 'end': 6, 'name': 'italic', 'value': 'True'}, annotations[4])
- self.assertIn({'start': 8, 'end': 13, 'name': 'bold', 'value': 'True'}, annotations[5])
- self.assertIn({'start': 0, 'end': 20, 'name': 'bold', 'value': 'True'}, annotations[6])
- self.assertIn({'start': 5, 'end': 20, 'name': 'underlined', 'value': 'True'}, annotations[6])
+ self.assertIn({"start": 0, "end": 11, "name": "style", "value": "Body"}, annotations[0])
+ self.assertIn({"start": 0, "end": 12, "name": "italic", "value": "True"}, annotations[1])
+ self.assertIn({"start": 0, "end": 10, "name": "bold", "value": "True"}, annotations[2])
+ self.assertIn({"start": 0, "end": 16, "name": "underlined", "value": "True"}, annotations[3])
+ self.assertIn({"start": 0, "end": 6, "name": "italic", "value": "True"}, annotations[4])
+ self.assertIn({"start": 8, "end": 13, "name": "bold", "value": "True"}, annotations[5])
+ self.assertIn({"start": 0, "end": 20, "name": "bold", "value": "True"}, annotations[6])
+ self.assertIn({"start": 5, "end": 20, "name": "underlined", "value": "True"}, annotations[6])
# alignment
- self.assertIn({'start': 0, 'end': 10, 'name': 'alignment', 'value': 'left'}, annotations[8])
- self.assertIn({'start': 0, 'end': 14, 'name': 'alignment', 'value': 'center'}, annotations[9])
- self.assertIn({'start': 0, 'end': 11, 'name': 'alignment', 'value': 'right'}, annotations[10])
- self.assertIn({'start': 0, 'end': 29, 'name': 'alignment', 'value': 'both'}, annotations[11])
+ self.assertIn({"start": 0, "end": 10, "name": "alignment", "value": "left"}, annotations[8])
+ self.assertIn({"start": 0, "end": 14, "name": "alignment", "value": "center"}, annotations[9])
+ self.assertIn({"start": 0, "end": 11, "name": "alignment", "value": "right"}, annotations[10])
+ self.assertIn({"start": 0, "end": 29, "name": "alignment", "value": "both"}, annotations[11])
# indent
- self.assertIn({'start': 0, 'end': 12, 'name': 'indentation', 'value': '0'}, annotations[12])
- self.assertIn({'start': 0, 'end': 11, 'name': 'indentation', 'value': '720.0'}, annotations[13])
- self.assertIn({'start': 0, 'end': 12, 'name': 'indentation', 'value': '1440.0'}, annotations[14])
+ self.assertIn({"start": 0, "end": 12, "name": "indentation", "value": "0"}, annotations[12])
+ self.assertIn({"start": 0, "end": 11, "name": "indentation", "value": "720.0"}, annotations[13])
+ self.assertIn({"start": 0, "end": 12, "name": "indentation", "value": "1440.0"}, annotations[14])
# strike
- self.assertIn({'start': 0, 'end': 11, 'name': 'strike', 'value': 'True'}, annotations[15])
+ self.assertIn({"start": 0, "end": 11, "name": "strike", "value": "True"}, annotations[15])
def test_example_2(self) -> None:
result = self._send_request("example_2.docx", data={"structure_type": "linear"})
- subparagraphs = result['content']['structure']['subparagraphs']
- annotations = [subparagraph['annotations'] for subparagraph in subparagraphs]
+ subparagraphs = result["content"]["structure"]["subparagraphs"]
+ annotations = [subparagraph["annotations"] for subparagraph in subparagraphs]
# heading, italic, bold, underlined
- self.assertIn({'start': 0, 'end': 31, 'name': 'italic', 'value': 'True'}, annotations[3])
- self.assertIn({'start': 0, 'end': 31, 'name': 'style', 'value': 'heading 4'}, annotations[3])
- self.assertIn({'start': 0, 'end': 29, 'name': 'italic', 'value': 'True'}, annotations[8])
- self.assertIn({'start': 0, 'end': 29, 'name': 'style', 'value': 'heading 9'}, annotations[8])
- self.assertIn({'start': 66, 'end': 73, 'name': 'italic', 'value': 'True'}, annotations[35])
- self.assertIn({'start': 75, 'end': 89, 'name': 'bold', 'value': 'True'}, annotations[35])
- self.assertIn({'start': 91, 'end': 111, 'name': 'underlined', 'value': 'True'}, annotations[35])
- self.assertIn({'start': 0, 'end': 153, 'name': 'size', 'value': '14.0'}, annotations[35])
- self.assertIn({'start': 153, 'end': 175, 'name': 'size', 'value': '20.0'}, annotations[35])
- self.assertIn({'start': 183, 'end': 199, 'name': 'size', 'value': '11.0'}, annotations[35])
+ self.assertIn({"start": 0, "end": 31, "name": "italic", "value": "True"}, annotations[3])
+ self.assertIn({"start": 0, "end": 31, "name": "style", "value": "heading 4"}, annotations[3])
+ self.assertIn({"start": 0, "end": 29, "name": "italic", "value": "True"}, annotations[8])
+ self.assertIn({"start": 0, "end": 29, "name": "style", "value": "heading 9"}, annotations[8])
+ self.assertIn({"start": 66, "end": 73, "name": "italic", "value": "True"}, annotations[35])
+ self.assertIn({"start": 75, "end": 89, "name": "bold", "value": "True"}, annotations[35])
+ self.assertIn({"start": 91, "end": 111, "name": "underlined", "value": "True"}, annotations[35])
+ self.assertIn({"start": 0, "end": 153, "name": "size", "value": "14.0"}, annotations[35])
+ self.assertIn({"start": 153, "end": 175, "name": "size", "value": "20.0"}, annotations[35])
+ self.assertIn({"start": 183, "end": 199, "name": "size", "value": "11.0"}, annotations[35])
# alignment
- self.assertIn({'start': 0, 'end': 46, 'name': 'alignment', 'value': 'right'}, annotations[43])
- self.assertIn({'start': 0, 'end': 40, 'name': 'alignment', 'value': 'center'}, annotations[44])
- self.assertIn({'start': 0, 'end': 160, 'name': 'alignment', 'value': 'both'}, annotations[45])
+ self.assertIn({"start": 0, "end": 46, "name": "alignment", "value": "right"}, annotations[43])
+ self.assertIn({"start": 0, "end": 40, "name": "alignment", "value": "center"}, annotations[44])
+ self.assertIn({"start": 0, "end": 160, "name": "alignment", "value": "both"}, annotations[45])
# bold, italic, underlined
- self.assertIn({'start': 0, 'end': 26, 'name': 'bold', 'value': 'True'}, annotations[47])
- self.assertIn({'start': 0, 'end': 29, 'name': 'italic', 'value': 'True'}, annotations[48])
- self.assertIn({'start': 0, 'end': 32, 'name': 'underlined', 'value': 'True'}, annotations[49])
- self.assertIn({'start': 0, 'end': 35, 'name': 'bold', 'value': 'True'}, annotations[50])
- self.assertIn({'start': 0, 'end': 35, 'name': 'italic', 'value': 'True'}, annotations[50])
- self.assertIn({'start': 0, 'end': 51, 'name': 'bold', 'value': 'True'}, annotations[51])
- self.assertIn({'start': 0, 'end': 51, 'name': 'underlined', 'value': 'True'}, annotations[51])
- self.assertIn({'start': 0, 'end': 51, 'name': 'italic', 'value': 'True'}, annotations[51])
+ self.assertIn({"start": 0, "end": 26, "name": "bold", "value": "True"}, annotations[47])
+ self.assertIn({"start": 0, "end": 29, "name": "italic", "value": "True"}, annotations[48])
+ self.assertIn({"start": 0, "end": 32, "name": "underlined", "value": "True"}, annotations[49])
+ self.assertIn({"start": 0, "end": 35, "name": "bold", "value": "True"}, annotations[50])
+ self.assertIn({"start": 0, "end": 35, "name": "italic", "value": "True"}, annotations[50])
+ self.assertIn({"start": 0, "end": 51, "name": "bold", "value": "True"}, annotations[51])
+ self.assertIn({"start": 0, "end": 51, "name": "underlined", "value": "True"}, annotations[51])
+ self.assertIn({"start": 0, "end": 51, "name": "italic", "value": "True"}, annotations[51])
def test_spacing_1(self) -> None:
result = self._send_request("spacing_libreoffice.docx", data={"structure_type": "linear"})
- subparagraphs = result['content']['structure']['subparagraphs']
- annotations = [subparagraph['annotations'] for subparagraph in subparagraphs]
-
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[0])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[1])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '57'}, annotations[2])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[3])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[4])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[5])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[6])
- self.assertIn({'start': 0, 'end': 9, 'name': 'spacing', 'value': '0'}, annotations[7])
+ subparagraphs = result["content"]["structure"]["subparagraphs"]
+ annotations = [subparagraph["annotations"] for subparagraph in subparagraphs]
+
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[0])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[1])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "57"}, annotations[2])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[3])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[4])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[5])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[6])
+ self.assertIn({"start": 0, "end": 9, "name": "spacing", "value": "0"}, annotations[7])
def test_spacing_2(self) -> None:
result = self._send_request("spacing_microsoft_word.docx", data={"structure_type": "linear"})
- subparagraphs = result['content']['structure']['subparagraphs']
- annotations = [subparagraph['annotations'] for subparagraph in subparagraphs]
-
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[0])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[1])
- self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '200'}, annotations[2])
- self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '200'}, annotations[3])
- self.assertIn({'start': 0, 'end': 32, 'name': 'spacing', 'value': '400'}, annotations[4])
- self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '400'}, annotations[5])
- self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '600'}, annotations[6])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '400'}, annotations[7])
- self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[8])
+ subparagraphs = result["content"]["structure"]["subparagraphs"]
+ annotations = [subparagraph["annotations"] for subparagraph in subparagraphs]
+
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[0])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[1])
+ self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "200"}, annotations[2])
+ self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "200"}, annotations[3])
+ self.assertIn({"start": 0, "end": 32, "name": "spacing", "value": "400"}, annotations[4])
+ self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "400"}, annotations[5])
+ self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "600"}, annotations[6])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "400"}, annotations[7])
+ self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[8])
def test_identation(self) -> None:
result = self._send_request("indentation_libreoffice.docx", data={"structure_type": "linear"})
- subparagraphs = result['content']['structure']['subparagraphs']
- annotations = [subparagraph['annotations'] for subparagraph in subparagraphs]
- self.assertIn({'start': 0, 'end': 188, 'name': 'indentation', 'value': '360.0'}, annotations[5])
- self.assertIn({'start': 0, 'end': 152, 'name': 'indentation', 'value': '708.0'}, annotations[10])
- self.assertIn({'start': 0, 'end': 0, 'name': 'indentation', 'value': '1429.0'}, annotations[12])
- self.assertIn({'start': 0, 'end': 21, 'name': 'indentation', 'value': '709.0'}, annotations[16])
- self.assertIn({'start': 0, 'end': 65, 'name': 'indentation', 'value': '786.0'}, annotations[20])
+ subparagraphs = result["content"]["structure"]["subparagraphs"]
+ annotations = [subparagraph["annotations"] for subparagraph in subparagraphs]
+ self.assertIn({"start": 0, "end": 188, "name": "indentation", "value": "360.0"}, annotations[5])
+ self.assertIn({"start": 0, "end": 152, "name": "indentation", "value": "708.0"}, annotations[10])
+ self.assertIn({"start": 0, "end": 0, "name": "indentation", "value": "1429.0"}, annotations[12])
+ self.assertIn({"start": 0, "end": 21, "name": "indentation", "value": "709.0"}, annotations[16])
+ self.assertIn({"start": 0, "end": 65, "name": "indentation", "value": "786.0"}, annotations[20])
def test_table_refs(self) -> None:
result = self._send_request("table_refs.docx", data={"structure_type": "linear"})
- subparagraphs = result['content']['structure']['subparagraphs']
+ subparagraphs = result["content"]["structure"]["subparagraphs"]
for i in [0, 2, 4, 6, 9]:
- annotations = subparagraphs[i]['annotations']
+ annotations = subparagraphs[i]["annotations"]
found = False
for annotation in annotations:
if annotation["name"] == "table":
diff --git a/tests/api_tests/test_api_format_email.py b/tests/api_tests/test_api_format_email.py
index 055c86fe..0846538f 100644
--- a/tests/api_tests/test_api_format_email.py
+++ b/tests/api_tests/test_api_format_email.py
@@ -14,7 +14,7 @@ def test_email_file(self) -> None:
attachments = result["attachments"]
self.assertEqual(len(attachments), 1) # message header fields
- self.assertIn("message_header_", attachments[0]['metadata']['file_name'])
+ self.assertIn("message_header_", attachments[0]["metadata"]["file_name"])
content = result["content"]
structure = content["structure"]
self._check_tree_sanity(structure)
diff --git a/tests/api_tests/test_api_format_excel.py b/tests/api_tests/test_api_format_excel.py
index 5cc06bca..aec440c3 100644
--- a/tests/api_tests/test_api_format_excel.py
+++ b/tests/api_tests/test_api_format_excel.py
@@ -72,19 +72,19 @@ def __check_content_formulas(self, tables: List[dict]) -> None:
self.assertEqual(2, len(tables))
table1, table2 = (table["cells"] for table in tables)
- self.assertListEqual(['a', 'b', 'c'], table1[0])
- self.assertListEqual(['1.0', '2.0', '3.0'], table1[1])
- self.assertListEqual(['3.0', '4.0', '7.0'], table1[2])
- self.assertListEqual(['2.0', '3.0', '5.0'], table1[3])
- self.assertListEqual(['5.0', '6.0', '11.0'], table1[4])
- self.assertListEqual(['7.0', '33.0', '40.0'], table1[5])
-
- self.assertListEqual(['r', 'p', 's', 'pi'], table2[0])
- self.assertListEqual(['1.0', '6.28', '3.14', '3.14'], table2[1])
- self.assertListEqual(['2.0', '12.56', '12.56', ''], table2[2])
- self.assertListEqual(['3.0', '18.84', '28.26', ''], table2[3])
- self.assertListEqual(['4.0', '25.12', '50.24', ''], table2[4])
- self.assertListEqual(['5.0', '31.4', '78.5', ''], table2[5])
- self.assertListEqual(['6.0', '37.68', '113.04', ''], table2[6])
- self.assertListEqual(['7.0', '43.96', '153.86', ''], table2[7])
- self.assertListEqual(['8.0', '50.24', '200.96', ''], table2[8])
+ self.assertListEqual(["a", "b", "c"], table1[0])
+ self.assertListEqual(["1.0", "2.0", "3.0"], table1[1])
+ self.assertListEqual(["3.0", "4.0", "7.0"], table1[2])
+ self.assertListEqual(["2.0", "3.0", "5.0"], table1[3])
+ self.assertListEqual(["5.0", "6.0", "11.0"], table1[4])
+ self.assertListEqual(["7.0", "33.0", "40.0"], table1[5])
+
+ self.assertListEqual(["r", "p", "s", "pi"], table2[0])
+ self.assertListEqual(["1.0", "6.28", "3.14", "3.14"], table2[1])
+ self.assertListEqual(["2.0", "12.56", "12.56", ""], table2[2])
+ self.assertListEqual(["3.0", "18.84", "28.26", ""], table2[3])
+ self.assertListEqual(["4.0", "25.12", "50.24", ""], table2[4])
+ self.assertListEqual(["5.0", "31.4", "78.5", ""], table2[5])
+ self.assertListEqual(["6.0", "37.68", "113.04", ""], table2[6])
+ self.assertListEqual(["7.0", "43.96", "153.86", ""], table2[7])
+ self.assertListEqual(["8.0", "50.24", "200.96", ""], table2[8])
diff --git a/tests/api_tests/test_api_format_html.py b/tests/api_tests/test_api_format_html.py
index 5df163c7..8dd3a748 100644
--- a/tests/api_tests/test_api_format_html.py
+++ b/tests/api_tests/test_api_format_html.py
@@ -1,7 +1,6 @@
import os
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
-
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
@@ -53,10 +52,10 @@ def __check_example_file(self, result: dict, file_name: str) -> None:
node = self._get_by_tree_path(tree, "0.0.0.1.0")
self.assertEqual("raw_text", node["metadata"]["paragraph_type"])
self.assertEqual("Определим определения \nТекст ", node["text"].strip()[:30])
- self.assertIn({'start': 1, 'end': 31, 'name': 'bold', 'value': 'True'}, node["annotations"])
- self.assertIn({'start': 46, 'end': 52, 'name': 'bold', 'value': 'True'}, node["annotations"])
- self.assertIn({'start': 42, 'end': 45, 'name': 'underlined', 'value': 'True'}, node["annotations"])
- self.assertIn({'start': 32, 'end': 42, 'name': 'italic', 'value': 'True'}, node["annotations"])
+ self.assertIn({"start": 1, "end": 31, "name": "bold", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 46, "end": 52, "name": "bold", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 42, "end": 45, "name": "underlined", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 32, "end": 42, "name": "italic", "value": "True"}, node["annotations"])
node = self._get_by_tree_path(tree, "0.0.0.2")
self.assertEqual("header", node["metadata"]["paragraph_type"])
@@ -109,7 +108,7 @@ def __check_example_file(self, result: dict, file_name: str) -> None:
self.assertListEqual(["Петров", "Пётр", "Петрович"], table2["cells"][2])
self.assertListEqual(["Сидоров", "Сидор", "Сидорович"], table2["cells"][3])
- self.__check_metainfo(result['metadata'], 'text/html', file_name)
+ self.__check_metainfo(result["metadata"], "text/html", file_name)
def test_part_html(self) -> None:
file_name = "part.html"
@@ -118,11 +117,9 @@ def test_part_html(self) -> None:
content = result["content"]["structure"]
self._check_tree_sanity(content)
self.assertEqual("Лесные слоны", content["subparagraphs"][0]["text"].strip())
- self.assertEqual("В данном разделе мы поговорим о малоизвестных лесных слонах...",
- content["subparagraphs"][0]["subparagraphs"][0]["text"].strip())
+ self.assertEqual("В данном разделе мы поговорим о малоизвестных лесных слонах...", content["subparagraphs"][0]["subparagraphs"][0]["text"].strip())
self.assertEqual("Среда обитания", content["subparagraphs"][0]["subparagraphs"][1]["text"].strip())
- self.assertEqual("Лесные слоны живут не на деревьях, а под ними.",
- content["subparagraphs"][0]["subparagraphs"][1]["subparagraphs"][0]["text"].strip())
+ self.assertEqual("Лесные слоны живут не на деревьях, а под ними.", content["subparagraphs"][0]["subparagraphs"][1]["subparagraphs"][0]["text"].strip())
def test_plain_text_html(self) -> None:
file_name = "plain.html"
@@ -173,15 +170,15 @@ def test_html_newlines(self) -> None:
self.assertIn("For service repair (Part 145) returned material authorizations (RMA):", text)
def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None:
- self.assertEqual(metainfo['file_type'], actual_type)
- self.assertEqual(metainfo['file_name'], actual_name)
+ self.assertEqual(metainfo["file_type"], actual_type)
+ self.assertEqual(metainfo["file_name"], actual_name)
def test_html_encoding(self) -> None:
file_name = "53.html"
result = self._send_request(file_name)
content = result["content"]["structure"]
text = content["subparagraphs"][0]["text"]
- self.assertTrue(text.startswith('\n\n'))
+ self.assertTrue(text.startswith("\n\n"))
def test_html_no_newline(self) -> None:
file_name = "no_new_line.html"
@@ -192,8 +189,7 @@ def test_html_no_newline(self) -> None:
expected_text = ('"I can’t bring myself to feel too sorry for Amazon or textbook publishers, given how much '
'they tend to gouge on the prices of those books."')
self.assertEqual(expected_text, text.strip())
- italics = [text[annotation["start"]: annotation["end"]] for annotation in node["annotations"] if
- annotation["name"] == "italic"]
+ italics = [text[annotation["start"]: annotation["end"]] for annotation in node["annotations"] if annotation["name"] == "italic"]
self.assertIn("or", italics)
def test_html_none_display(self) -> None:
diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py
index 99968320..72128afc 100644
--- a/tests/api_tests/test_api_format_json.py
+++ b/tests/api_tests/test_api_format_json.py
@@ -81,7 +81,7 @@ def test_broken(self) -> None:
def test_json_attachments2(self) -> None:
file_name = "test2.json"
- data = {'html_fields': '[["e"], ["f"]]', 'with_attachments': 'True', 'return_base64': 'true'}
+ data = {"html_fields": '[["e"], ["f"]]', "with_attachments": "True", "return_base64": "true"}
self._send_request(file_name, expected_code=200, data=data)
def test_json_null(self) -> None:
@@ -104,7 +104,7 @@ def test_json_null(self) -> None:
def test_json_broken_parameters(self) -> None:
file_name = "test2.json"
- data = {'html_fields': '[[ef]]', 'with_attachments': 'True', 'return_base64': 'true'}
+ data = {"html_fields": "[[ef]]", "with_attachments": "True", "return_base64": "true"}
with self.assertRaises(JSONDecodeError):
json.loads(data["html_fields"])
self._send_request(file_name, expected_code=400, data=data)
diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py
index 2e49beba..5f766472 100644
--- a/tests/api_tests/test_api_format_pdf.py
+++ b/tests/api_tests/test_api_format_pdf.py
@@ -1,10 +1,10 @@
import os
-from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
from dedoc.utils import supported_image_types
+from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
class TestApiPdfReader(AbstractTestApiDocReader):
@@ -23,26 +23,25 @@ def __check_example_file(self, result: dict) -> None:
self._check_similarity("1.2.1 Поясним за непонятное", content[3]["subparagraphs"][0]["text"])
def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None:
- self.assertEqual(metainfo['file_type'], actual_type)
- self.assertEqual(metainfo['file_name'], actual_name)
+ self.assertEqual(metainfo["file_type"], actual_type)
+ self.assertEqual(metainfo["file_name"], actual_name)
def test_pdf(self) -> None:
file_name = "example.pdf"
result = self._send_request(file_name, data=dict(with_attachments=True, document_type="", pdf_with_text_layer="false"))
self.__check_example_file(result)
- self.__check_metainfo(result['metadata'], 'application/pdf', file_name)
- self.assertEqual([], result['attachments'])
+ self.__check_metainfo(result["metadata"], "application/pdf", file_name)
+ self.assertEqual([], result["attachments"])
def test_djvu(self) -> None:
file_name = "example_with_table7.djvu"
result = self._send_request(file_name, dict(document_type=""))
tree = result["content"]["structure"]
self._check_tree_sanity(tree)
- self.assertEqual('2. Срок поставки в течении 70 дней с момента внесения авансового платежа.\n',
- self._get_by_tree_path(tree, "0.2.1")['text'])
- self.assertEqual("3. Срок изготовления не ранее 2018г.\n", self._get_by_tree_path(tree, "0.2.2")['text'])
+ self.assertEqual("2. Срок поставки в течении 70 дней с момента внесения авансового платежа.\n", self._get_by_tree_path(tree, "0.2.1")["text"])
+ self.assertEqual("3. Срок изготовления не ранее 2018г.\n", self._get_by_tree_path(tree, "0.2.2")["text"])
- self.__check_metainfo(result['metadata'], 'image/vnd.djvu', file_name)
+ self.__check_metainfo(result["metadata"], "image/vnd.djvu", file_name)
def test_djvu_2(self) -> None:
file_name = "example_with_table9.djvu"
@@ -52,7 +51,7 @@ def test_djvu_2(self) -> None:
self.assertEqual("1. Предмет закупки, источник финансирования :\n", self._get_by_tree_path(content, "0.1.0")["text"])
self.assertEqual("2. Место выполнения Работ:\n", self._get_by_tree_path(content, "0.1.1")["text"])
- self.__check_metainfo(result['metadata'], 'image/vnd.djvu', file_name)
+ self.__check_metainfo(result["metadata"], "image/vnd.djvu", file_name)
def test_broken_djvu(self) -> None:
file_name = "broken.djvu"
@@ -75,13 +74,13 @@ def test_header_pdf(self) -> None:
self._check_similarity("4.5. п", self._get_by_tree_path(tree, "0.1.3.0.4")["text"])
self._check_similarity("4.6. п", self._get_by_tree_path(tree, "0.1.3.0.5")["text"])
- self.__check_metainfo(result['metadata'], 'application/pdf', file_name)
+ self.__check_metainfo(result["metadata"], "application/pdf", file_name)
def test_images(self) -> None:
formats = [
- '.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.ppm', '.pnm', '.pgm',
- '.pbm', '.webp', '.pcx', '.eps', '.sgi', '.hdr', '.pic', '.sr', '.ras',
- '.dib', '.jpe', '.jfif', '.j2k'
+ ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif", ".ppm", ".pnm", ".pgm",
+ ".pbm", ".webp", ".pcx", ".eps", ".sgi", ".hdr", ".pic", ".sr", ".ras",
+ ".dib", ".jpe", ".jfif", ".j2k"
]
for image_format in formats:
@@ -101,35 +100,34 @@ def test_image_binarization(self) -> None:
result = self._send_request("01_МФО_Наклон.jpg", data=dict(need_binarization="true"))
self.assertIn("ЦЕНТРАЛЬНЫЙ БАНК РОССИЙСКОЙ ФЕДЕРАЦИИ\n{БАНК РОССИИ)\nСВИДЕТЕЛЬСТВО\nО ВНЕСЕНИИ СВЕДЕНИЙ О ЮРИДИЧЕСКОМ ЛИЦЕ\n"
- "В ГОСУДАРСТВЕННЫЙ РЕЕСТР МИКРОФИНАНСОВЫХ ОРГАНИЗАЦИЙ", result['content']['structure']['subparagraphs'][0]['text'])
- self.assertIn("Е.И Курицына\n(расшифровка подлиси", result['content']['structure']['subparagraphs'][1]['text'])
+ "В ГОСУДАРСТВЕННЫЙ РЕЕСТР МИКРОФИНАНСОВЫХ ОРГАНИЗАЦИЙ", result["content"]["structure"]["subparagraphs"][0]["text"])
+ self.assertIn("Е.И Курицына\n(расшифровка подлиси", result["content"]["structure"]["subparagraphs"][1]["text"])
def test_on_ocr_conf_threshold(self) -> None:
result = self._send_request("with_trash.jpg", data=dict(structure_type="tree"))
tree = result["content"]["structure"]
self._check_tree_sanity(tree)
# check, that handwritten text was filtered
- self._check_similarity('ФИО года рождения, паспорт: серия \n№ выдан _, дата выдачи\n'
- 'т. код подразделения зарегистрированный по адресу:\n \n', tree['subparagraphs'][3]['text'])
+ self._check_similarity("ФИО года рождения, паспорт: серия \n№ выдан _, дата выдачи\nт. код подразделения зарегистрированный по адресу:\n \n",
+ tree["subparagraphs"][3]["text"])
def test_rotated_image(self) -> None:
result = self._send_request("orient_1.png", data=dict(need_pdf_table_analysis="false"))
tree = result["content"]["structure"]
self._check_tree_sanity(tree)
- self._check_similarity(tree['subparagraphs'][0]['text'], 'Приложение к Положению о порядке\n'
- 'формирования, ведения и утверждения\n'
- 'ведомственных перечней государственных услуг\n'
- 'и работ, оказываемых и выполняемых\n'
- 'государственными учреждениями Калужской\n'
- 'области\n')
+ self._check_similarity(tree["subparagraphs"][0]["text"], "Приложение к Положению о порядке\n"
+ "формирования, ведения и утверждения\n"
+ "ведомственных перечней государственных услуг\n"
+ "и работ, оказываемых и выполняемых\n"
+ "государственными учреждениями Калужской\n"
+ "области\n")
def test_pdf_with_only_mp_table(self) -> None:
file_name = os.path.join("..", "tables", "multipage_table.pdf")
result = self._send_request(file_name)
- table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"]
- if ann["name"] == "table"]
+ table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"] if ann["name"] == "table"]
self.assertTrue(len(result["content"]["tables"]), len(table_refs))
for table in result["content"]["tables"]:
@@ -145,15 +143,14 @@ def test_pdf_with_some_tables(self) -> None:
# checks indentations
par = self._get_by_tree_path(tree, "0.4.0.0")
annotations = par["annotations"]
- self.assertIn({"end": 170, 'value': '600', 'name': 'indentation', 'start': 0}, annotations)
+ self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, annotations)
self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"])
def test_pdf_with_only_table(self) -> None:
file_name = os.path.join("..", "pdf_with_text_layer", "VVP_global_table.pdf")
result = self._send_request(file_name)
- self.assertTrue(result["content"]["tables"][0]["metadata"]["uid"] ==
- result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"])
+ self.assertEqual(result["content"]["tables"][0]["metadata"]["uid"], result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"])
def test_2_columns(self) -> None:
file_name = os.path.join("..", "scanned", "example_2_columns.png")
@@ -166,8 +163,8 @@ def test_document_orientation(self) -> None:
file_name = "orient_3.png"
result = self._send_request(file_name, data=dict(document_orientation="auto"))
tree = result["content"]["structure"]
- self._check_similarity(tree['subparagraphs'][0]['text'], 'Приложение к постановлению\n'
- 'Губернатора Камчатского края\n'
- '0729.12.2014 № 168\n'
+ self._check_similarity(tree["subparagraphs"][0]["text"], "Приложение к постановлению\n"
+ "Губернатора Камчатского края\n"
+ "0729.12.2014 № 168\n"
'"БУРЫЙ МЕДВЕДЬ\n'
- '{вид охотничьих ресурсов)\n')
+ "{вид охотничьих ресурсов)\n")
diff --git a/tests/api_tests/test_api_format_pdf_page_limit.py b/tests/api_tests/test_api_format_pdf_page_limit.py
index d2533d64..ee1bf841 100644
--- a/tests/api_tests/test_api_format_pdf_page_limit.py
+++ b/tests/api_tests/test_api_format_pdf_page_limit.py
@@ -65,9 +65,9 @@ def __check(self, pages: str, text_expected: str, reader: str, check_partially:
result = self._send_request("multipage.pdf", params)
if check_partially:
self.assertIn("The document is partially parsed", result["warnings"])
- self.assertIn('first_page', result['metadata'])
- self.assertIn('last_page', result['metadata'])
+ self.assertIn("first_page", result["metadata"])
+ self.assertIn("last_page", result["metadata"])
tree = result["content"]["structure"]
node = self._get_by_tree_path(tree, "0.0")
text = node["text"].strip()
- self.assertEqual(text_expected, text, "{} and {}".format(pages, reader))
+ self.assertEqual(text_expected, text, f"{pages} and {reader}")
diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py
index b273fef8..67d66b5a 100644
--- a/tests/api_tests/test_api_format_pdf_tabby_reader.py
+++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py
@@ -95,26 +95,26 @@ def test_pdf_with_text_style(self) -> None:
tree = result["content"]["structure"]
self._check_tree_sanity(tree=tree)
sub1 = self._get_by_tree_path(tree, "0.0.0")
- self.assertEqual('1.1 TimesNewRomanItalicBold20\n', sub1["text"])
- self.assertIn({'start': 0, 'end': 29, "name": "size", 'value': '20'}, sub1['annotations'])
+ self.assertEqual("1.1 TimesNewRomanItalicBold20\n", sub1["text"])
+ self.assertIn({"start": 0, "end": 29, "name": "size", "value": "20"}, sub1["annotations"])
sub1sub1 = self._get_by_tree_path(tree, "0.0.0.0")
- self.assertEqual('Different styles(Arial16):\n', sub1sub1['text'])
- self.assertIn({'start': 0, 'end': 26, "name": "size", 'value': '15'}, sub1sub1['annotations'])
+ self.assertEqual("Different styles(Arial16):\n", sub1sub1["text"])
+ self.assertIn({"start": 0, "end": 26, "name": "size", "value": "15"}, sub1sub1["annotations"])
sub2 = self._get_by_tree_path(tree, "0.1.0")
- self.assertEqual('1. TimesNewRoman18\n', sub2['text'])
- self.assertIn({'start': 3, 'end': 18, "name": "size", 'value': '18'}, sub2['annotations'])
+ self.assertEqual("1. TimesNewRoman18\n", sub2["text"])
+ self.assertIn({"start": 3, "end": 18, "name": "size", "value": "18"}, sub2["annotations"])
sub3 = self._get_by_tree_path(tree, "0.1.1")
- self.assertEqual('2. TimesNewRoman9, TimesNewRomanBold7.5, TimesNewRoman6.5\n', sub3['text'])
- self.assertIn({'start': 3, 'end': 18, "name": "size", 'value': '9'}, sub3['annotations'])
- self.assertIn({'start': 19, 'end': 57, "name": "size", 'value': '6'}, sub3['annotations'])
+ self.assertEqual("2. TimesNewRoman9, TimesNewRomanBold7.5, TimesNewRoman6.5\n", sub3["text"])
+ self.assertIn({"start": 3, "end": 18, "name": "size", "value": "9"}, sub3["annotations"])
+ self.assertIn({"start": 19, "end": 57, "name": "size", "value": "6"}, sub3["annotations"])
sub4 = self._get_by_tree_path(tree, "0.1.2")
- self.assertEqual('3. TimesNewRomanItalic14, Calibri18, Tahoma16\n', sub4['text'])
- self.assertIn({'start': 3, 'end': 25, "name": "size", 'value': '14'}, sub4['annotations'])
- self.assertIn({'start': 26, 'end': 36, "name": "size", 'value': '18'}, sub4['annotations'])
+ self.assertEqual("3. TimesNewRomanItalic14, Calibri18, Tahoma16\n", sub4["text"])
+ self.assertIn({"start": 3, "end": 25, "name": "size", "value": "14"}, sub4["annotations"])
+ self.assertIn({"start": 26, "end": 36, "name": "size", "value": "18"}, sub4["annotations"])
def test_tables2(self) -> None:
file_name = "VVP_global_table.pdf"
diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py
index 54265beb..407d97af 100644
--- a/tests/api_tests/test_api_format_pdf_with_text.py
+++ b/tests/api_tests/test_api_format_pdf_with_text.py
@@ -16,71 +16,69 @@ def __filter_by_name(self, annotations: List[dict], name: str) -> List[dict]:
@unittest.skip("TODO")
def test_pdf_with_text_style(self) -> None:
file_name = "diff_styles.pdf"
- result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="",
- need_pdf_table_analysis="false"))
+ result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false"))
tree = result["content"]["structure"]
self._check_tree_sanity(tree)
node = self._get_by_tree_path(tree, "0.0")
- self.assertEqual('1.1TimesNewRomanItalicBold20\n', node['text'])
- self.assertIn({'start': 0, 'end': 28, "name": "size", 'value': '20.0'}, node['annotations'])
+ self.assertEqual("1.1TimesNewRomanItalicBold20\n", node["text"])
+ self.assertIn({"start": 0, "end": 28, "name": "size", "value": "20.0"}, node["annotations"])
node = self._get_by_tree_path(tree, "0.1")
- annotations_size = self.__filter_by_name(name="size", annotations=node['annotations'])
- self.assertIn({'start': 0, 'end': 26, "name": "size", 'value': '16.0'}, annotations_size)
- self.assertEqual(len(node['annotations']), 5)
+ annotations_size = self.__filter_by_name(name="size", annotations=node["annotations"])
+ self.assertIn({"start": 0, "end": 26, "name": "size", "value": "16.0"}, annotations_size)
+ self.assertEqual(len(node["annotations"]), 5)
self.assertEqual("Different styles(Arial16):\n", node["text"])
node = self._get_by_tree_path(tree, "0.2.2")
- self.assertEqual('3. TimesNewRomanItalic14, Calibri18, Tahoma16\n', node['text'])
- self.assertEqual('3. ', node['text'][0:3])
- self.assertIn({'start': 0, 'end': 36, 'name': "style", 'value': 'TimesNewRomanPSMT'}, node['annotations'])
- self.assertIn({'start': 0, 'end': 2, "name": "size", 'value': '16.0'}, node['annotations'])
- self.assertEqual('TimesNewRomanItalic14, ', node['text'][3:26])
- self.assertIn({'start': 0, 'end': 36, "name": "style", 'value': 'TimesNewRomanPSMT'}, node['annotations'])
- self.assertIn({'start': 3, 'end': 25, "name": "size", 'value': '14.0'}, node['annotations'])
- self.assertEqual('Calibri18, ', node['text'][26:37])
- self.assertIn({'start': 0, 'end': 36, "name": "style", 'value': 'TimesNewRomanPSMT'}, node['annotations'])
- self.assertIn({'start': 26, 'end': 36, 'value': '18.0', "name": "size"}, node['annotations'])
- self.assertEqual('Tahoma16\n', node['text'][37:46])
- self.assertIn({'start': 37, 'end': 45, 'value': 'Tahoma', "name": "style"}, node['annotations'])
- self.assertIn({'start': 37, 'end': 45, "name": "size", 'value': '16.0'}, node['annotations'])
- self.assertEqual(9, len(node['annotations']))
+ self.assertEqual("3. TimesNewRomanItalic14, Calibri18, Tahoma16\n", node["text"])
+ self.assertEqual("3. ", node["text"][0:3])
+ self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"])
+ self.assertIn({"start": 0, "end": 2, "name": "size", "value": "16.0"}, node["annotations"])
+ self.assertEqual("TimesNewRomanItalic14, ", node["text"][3:26])
+ self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"])
+ self.assertIn({"start": 3, "end": 25, "name": "size", "value": "14.0"}, node["annotations"])
+ self.assertEqual("Calibri18, ", node["text"][26:37])
+ self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"])
+ self.assertIn({"start": 26, "end": 36, "value": "18.0", "name": "size"}, node["annotations"])
+ self.assertEqual("Tahoma16\n", node["text"][37:46])
+ self.assertIn({"start": 37, "end": 45, "value": "Tahoma", "name": "style"}, node["annotations"])
+ self.assertIn({"start": 37, "end": 45, "name": "size", "value": "16.0"}, node["annotations"])
+ self.assertEqual(9, len(node["annotations"]))
def test_pdf_with_text_style_2(self) -> None:
file_name = "2-column-state.pdf"
result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false"))
tree = result["content"]["structure"]
self._check_tree_sanity(tree)
- subs = tree['subparagraphs']
+ subs = tree["subparagraphs"]
sub = self._get_by_tree_path(tree, "0.0")
- self.assertEqual("Compromising Tor Anonymity\n", sub['text'][0:27])
- annotations_size = self.__filter_by_name(name="size", annotations=subs[0]['annotations'])
- self.assertIn({'start': 0, 'end': 61, "name": "size", 'value': '18.0'}, annotations_size)
+ self.assertEqual("Compromising Tor Anonymity\n", sub["text"][0:27])
+ annotations_size = self.__filter_by_name(name="size", annotations=subs[0]["annotations"])
+ self.assertIn({"start": 0, "end": 61, "name": "size", "value": "18.0"}, annotations_size)
- annotations_style = self.__filter_by_name(name="style", annotations=subs[0]['annotations'])
- self.assertIn({'start': 0, 'end': 61, 'name': 'style', 'value': 'Helvetica-Bold'}, annotations_style)
+ annotations_style = self.__filter_by_name(name="style", annotations=subs[0]["annotations"])
+ self.assertIn({"start": 0, "end": 61, "name": "style", "value": "Helvetica-Bold"}, annotations_style)
- annotations_bold = self.__filter_by_name(name="bold", annotations=subs[0]['annotations'])
- self.assertIn({'start': 0, 'end': 61, 'name': 'bold', 'value': "True"}, annotations_bold)
+ annotations_bold = self.__filter_by_name(name="bold", annotations=subs[0]["annotations"])
+ self.assertIn({"start": 0, "end": 61, "name": "bold", "value": "True"}, annotations_bold)
self.assertIn("Pere Manils, Abdelberi Chaabane, Stevens Le Blond,", self._get_by_tree_path(tree, "0.1")["text"])
@unittest.skip("TODO")
def test_pdf_with_2_columns_text(self) -> None:
file_name = "2-column-state.pdf"
- result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="",
- need_pdf_table_analysis="false"))
+ result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false"))
tree = result["content"]["structure"]
self._check_tree_sanity(tree)
self.assertIn("Privacy of users in P2P networks goes far beyond their\n"
"current usage and is a fundamental requirement to the adop-\n"
"tion of P2P protocols for legal usage. In a climate of cold",
- self._get_by_tree_path(tree, "0.5")['text'])
+ self._get_by_tree_path(tree, "0.5")["text"])
- self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6")['text'])
- self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.7")['text'])
+ self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6")["text"])
+ self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.7")["text"])
self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.8.0.0")["text"])
self.assertIn("The Tor network was designed to provide freedom\n"
@@ -96,8 +94,7 @@ def test_pdf_with_2_columns_text_2(self) -> None:
tree = result["content"]["structure"]
self.assertIn("References", self._get_by_tree_path(tree, "0.0")["text"])
- self.assertIn("[1] Navaneeth Bodla, Bharat Singh, Rama Chellappa, and",
- self._get_by_tree_path(tree, "0.1")["text"])
+ self.assertIn("[1] Navaneeth Bodla, Bharat Singh, Rama Chellappa, and", self._get_by_tree_path(tree, "0.1")["text"])
def test_pdf_with_some_tables(self) -> None:
file_name = "VVP_6_tables.pdf"
@@ -109,22 +106,20 @@ def test_pdf_with_some_tables(self) -> None:
# checks indentations
par = self._get_by_tree_path(tree, "0.4.0.0")
- self.assertIn({'end': 170, 'value': '600', 'name': 'indentation', 'start': 0}, par["annotations"])
+ self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, par["annotations"])
self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"])
def test_pdf_with_only_table(self) -> None:
file_name = "VVP_global_table.pdf"
result = self._send_request(file_name, dict(pdf_with_text_layer="true"))
- self.assertTrue(result["content"]["tables"][0]["metadata"]["uid"] ==
- result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"])
+ self.assertEqual(result["content"]["tables"][0]["metadata"]["uid"], result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"])
def test_pdf_with_only_mp_table(self) -> None:
file_name = os.path.join("..", "tables", "multipage_table.pdf")
result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_header_footer_analysis=True))
- table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"]
- if ann["name"] == "table"]
+ table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"] if ann["name"] == "table"]
self.assertTrue(len(result["content"]["tables"]), len(table_refs))
for table in result["content"]["tables"]:
diff --git a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py
index 14d0921a..2996d6d3 100644
--- a/tests/api_tests/test_api_format_pptx.py
+++ b/tests/api_tests/test_api_format_pptx.py
@@ -10,31 +10,31 @@ class TestApiPPTXReader(AbstractTestApiDocReader):
def test_pptx(self) -> None:
file_name = "example.pptx"
result = self._send_request(file_name, data=dict(structure_type="linear"))
- self.__check_content(result['content'])
+ self.__check_content(result["content"])
def test_ppt(self) -> None:
file_name = "example.ppt"
result = self._send_request(file_name, data=dict(structure_type="linear"))
- self.__check_content(result['content'])
+ self.__check_content(result["content"])
def test_odp(self) -> None:
file_name = "example.odp"
result = self._send_request(file_name, data=dict(structure_type="linear"))
- self.__check_content(result['content'])
+ self.__check_content(result["content"])
def __check_content(self, content: dict) -> None:
- subparagraphs = content['structure']['subparagraphs']
- self.assertEqual('A long time ago in a galaxy far far away ', subparagraphs[0]['text'])
- self.assertEqual('Example', subparagraphs[1]['text'])
- self.assertEqual('Some author', subparagraphs[2]['text'])
- self.assertEqual('This is simple table', subparagraphs[3]['text'])
-
- table = content['tables'][0]['cells']
- self.assertEqual('', table[0][0])
- self.assertEqual('Header1', table[0][1])
- self.assertEqual('Header2', table[0][2])
- self.assertEqual('Header3', table[0][3])
- self.assertEqual('Some content', table[1][0])
- self.assertEqual('A', table[1][1])
- self.assertEqual('B', table[1][2])
- self.assertEqual('C', table[1][3])
+ subparagraphs = content["structure"]["subparagraphs"]
+ self.assertEqual("A long time ago in a galaxy far far away ", subparagraphs[0]["text"])
+ self.assertEqual("Example", subparagraphs[1]["text"])
+ self.assertEqual("Some author", subparagraphs[2]["text"])
+ self.assertEqual("This is simple table", subparagraphs[3]["text"])
+
+ table = content["tables"][0]["cells"]
+ self.assertEqual("", table[0][0])
+ self.assertEqual("Header1", table[0][1])
+ self.assertEqual("Header2", table[0][2])
+ self.assertEqual("Header3", table[0][3])
+ self.assertEqual("Some content", table[1][0])
+ self.assertEqual("A", table[1][1])
+ self.assertEqual("B", table[1][2])
+ self.assertEqual("C", table[1][3])
diff --git a/tests/api_tests/test_api_format_txt.py b/tests/api_tests/test_api_format_txt.py
index f56c4842..3be5b0e4 100644
--- a/tests/api_tests/test_api_format_txt.py
+++ b/tests/api_tests/test_api_format_txt.py
@@ -16,33 +16,32 @@ def test_text(self) -> None:
file_name = "example.txt"
result = self._send_request(file_name, data={"structure_type": "tree"})
content = result["content"]["structure"]
- self.assertEqual(content["subparagraphs"][0]["text"].rstrip(), 'Пример документа')
+ self.assertEqual(content["subparagraphs"][0]["text"].rstrip(), "Пример документа")
- self._check_metainfo(result['metadata'], 'text/plain', file_name)
+ self._check_metainfo(result["metadata"], "text/plain", file_name)
def test_text_pretty_json(self) -> None:
file_name = "example.txt"
result = self._send_request(file_name, data={"structure_type": "tree", "return_format": "pretty_json"})
content = result["content"]["structure"]
- self.assertEqual(content["subparagraphs"][0]["text"].rstrip(), 'Пример документа')
+ self.assertEqual(content["subparagraphs"][0]["text"].rstrip(), "Пример документа")
- self._check_metainfo(result['metadata'], 'text/plain', file_name)
+ self._check_metainfo(result["metadata"], "text/plain", file_name)
def test_text_bad_return_format(self) -> None:
file_name = "example.txt"
result = self._send_request(file_name, data={"structure_type": "tree", "return_format": "broken"})
content = result["content"]["structure"]
- self.assertEqual(content["subparagraphs"][0]["text"].rstrip(), 'Пример документа')
+ self.assertEqual(content["subparagraphs"][0]["text"].rstrip(), "Пример документа")
- self._check_metainfo(result['metadata'], 'text/plain', file_name)
+ self._check_metainfo(result["metadata"], "text/plain", file_name)
def test_text2(self) -> None:
file_name = "pr_17.txt"
result = self._send_request(file_name, data={"structure_type": "tree"})
content = result["content"]["structure"]
self.assertIn("УТВЕРЖДЕНЫ", get_by_tree_path(content, "0.0")["text"])
- self.assertIn("1. Настоящие Требования разработаны в соответствии с Федеральным законом",
- get_by_tree_path(content, "0.1.0")["text"])
+ self.assertIn("1. Настоящие Требования разработаны в соответствии с Федеральным законом", get_by_tree_path(content, "0.1.0")["text"])
def test_special_symbols(self) -> None:
file_name = "special_symbol.txt"
@@ -69,7 +68,7 @@ def test_large_file(self) -> None:
content = result["content"]["structure"]["subparagraphs"][0]["text"]
for line_id, line in enumerate(content.split("\n")):
if line.strip() != "":
- self.assertEqual("Line number {:09d}".format(line_id), line)
+ self.assertEqual(f"Line number {line_id:09d}", line)
def test_txt_with_law(self) -> None:
file_name = "17 (1).txt"
@@ -97,7 +96,7 @@ def test_cp1251(self) -> None:
def __check_content(self, result: dict, encoding: str) -> None:
warning = result["warnings"][0]
- self.assertEqual(warning, "encoding is {}".format(encoding))
+ self.assertEqual(warning, f"encoding is {encoding}")
path = self._get_abs_path("utf8.txt")
with open(path) as file:
text = file.read()
@@ -109,17 +108,17 @@ def __check_football(self, content: dict) -> None:
self.assertTrue(node["text"].startswith(" Association football, more commonly known as simply"))
self.assertTrue(node["text"].endswith("The team with the higher number of goals wins the game.\n\n"))
annotations = node["annotations"]
- self.assertIn({'name': 'spacing', 'value': '50', 'start': 0, 'end': 546}, annotations)
+ self.assertIn({"name": "spacing", "value": "50", "start": 0, "end": 546}, annotations)
node = content[1]
self.assertTrue(node["text"].startswith(" Football is played in accordance with a set of rules known"))
self.assertTrue(node["text"].strip().endswith("the coin toss prior to kick-off or penalty kicks."))
annotations = node["annotations"]
- self.assertIn({'name': 'spacing', 'value': '100', 'start': 0, 'end': 163}, annotations)
+ self.assertIn({"name": "spacing", "value": "100", "start": 0, "end": 163}, annotations)
node = content[2]
self.assertTrue(node["text"].startswith(" Football is governed internationally by the International"))
self.assertTrue(node["text"].endswith("the 2019 FIFA Women's World Cup in France.\n\n"))
annotations = node["annotations"]
- self.assertIn({'name': 'spacing', 'value': '400', 'start': 0, 'end': 164}, annotations)
- self.assertIn({'name': 'spacing', 'value': '50', 'start': 164, 'end': 1068}, annotations)
+ self.assertIn({"name": "spacing", "value": "400", "start": 0, "end": 164}, annotations)
+ self.assertIn({"name": "spacing", "value": "50", "start": 164, "end": 1068}, annotations)
self.assertTrue(content[3]["text"].startswith(" The most prestigious competitions in European club"))
self.assertTrue(content[3]["text"].endswith("cost in excess of £600 million/€763 million/US$1.185 billion.\n"))
diff --git a/tests/api_tests/test_api_format_xml.py b/tests/api_tests/test_api_format_xml.py
index bbe80a1f..5810bfdc 100644
--- a/tests/api_tests/test_api_format_xml.py
+++ b/tests/api_tests/test_api_format_xml.py
@@ -12,6 +12,6 @@ def test_xml(self) -> None:
result = self._send_request(file_name, data={"structure_type": "linear"})
subparagraphs = result["content"]["structure"]["subparagraphs"]
self.assertEqual('\n', subparagraphs[0]["text"])
- self.assertEqual('N \n"
- "Фамилия \n"
- "Имя \n"
- "Организация \n"
- "Телефон \n"
- "Примечания \n"
+ 'N \n'
+ 'Фамилия \n'
+ 'Имя \n'
+ 'Организация \n'
+ 'Телефон \n'
+ 'Примечания \n'
"