diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..ff7c92ca --- /dev/null +++ b/.flake8 @@ -0,0 +1,26 @@ +[flake8] + +max-line-length = 160 +max-complexity = 13 +inline-quotes = " + +application-import-names = dedoc, tests +import-order-style = pycharm + +exclude = + .git, + __pycache__, + .idea, + .github, + *__init__.py, + resources, + dedoc/scripts, + examples, + docs, + venv, + build, + dedoc.egg-info + +# ANN101 - type annotations for self +ignore = + ANN101 diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml index 9b2f1917..de74c6df 100644 --- a/.github/workflows/test_on_push.yaml +++ b/.github/workflows/test_on_push.yaml @@ -29,11 +29,11 @@ jobs: uses: actions/setup-python@v2 with: python-version: '3.8' - - name: Install dependencies + - name: Run lint run: | python3 -m pip install --upgrade pip - pip3 install pycodestyle==2.7.0 flake8==3.9.2 flake8-annotations==2.6.2 pyflakes==2.3.1 + pip3 install .[lint] + flake8 . - name: Run tests run: | - python3 -m unittest -v -f tests/test_style.py test="true" docker-compose up --build --exit-code-from test diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..330ca49d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +repos: +- repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info + args: + - "--config=.flake8" + additional_dependencies: [ + flake8-absolute-import==1.0.0.1, + flake8-annotations==2.9.1, + flake8-bugbear==23.3.12, + flake8-builtins==2.1.0, + flake8-import-order==0.18.2, + flake8-print==5.0.0, + flake8-quotes==3.3.2, + flake8-use-fstring==1.4, + pycodestyle==2.9.0, + pep8-naming==0.13.3 + ] diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 463ab2c9..d17bfd6f 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -1,5 +1,5 @@ -# noqa from typing import Any, Optional + from fastapi import Body from pydantic import BaseModel @@ -36,68 +36,68 @@ class QueryParameters(BaseModel): def __init__(self, # type of document structure parsing - document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), - structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), - return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), + document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), # noqa + structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), # noqa + return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), # noqa # attachments handling - with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. 
Default: 'false'", default=None), - need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), - recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), - return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), - attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), + with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None), # noqa + need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), # noqa + recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), # noqa + return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), # noqa + attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), # noqa # tables handling - insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), - need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), - table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), - orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), - orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), + insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), # noqa + need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), # noqa + table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), # noqa + orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), # noqa + orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), # noqa # pdf handling - pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), - language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), - pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. 
left:right, read pages from left to right. Default: ':'", default=None), - is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), - document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), - need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), - need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), + pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), # noqa + language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), # noqa + pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), # noqa + is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), # noqa + document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), # noqa + need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), # noqa + need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), # noqa # other formats handling - delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), - encoding: Optional[str] = Body(description="a document encoding", default=None), - html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), - handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None), + delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), # noqa + encoding: Optional[str] = Body(description="a document encoding", default=None), # noqa + html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. 
Default: ''", default=None), # noqa + handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None), # noqa - **data: Any) -> None: + **data: Any) -> None: # noqa super().__init__(**data) - self.document_type: str = document_type or "" - self.structure_type: str = structure_type or 'tree' - self.return_format: str = return_format or 'json' - - self.with_attachments: str = with_attachments or 'false' - self.need_content_analysis: str = need_content_analysis or 'false' - self.recursion_deep_attachments: str = recursion_deep_attachments or '10' - self.return_base64: str = return_base64 or 'false' - self.attachments_dir: str = attachments_dir - - self.insert_table: str = insert_table or 'false' - self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true' - self.table_type: str = table_type or '' - self.orient_analysis_cells: str = orient_analysis_cells or 'false' - self.orient_cell_angle: str = orient_cell_angle or "90" - - self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby' - self.language: str = language or "rus+eng" - self.pages: str = pages or ':' - self.is_one_column_document: str = is_one_column_document or 'auto' - self.document_orientation: str = document_orientation or "auto" - self.need_header_footer_analysis: str = need_header_footer_analysis or 'false' - self.need_binarization: str = need_binarization or 'false' - - self.delimiter: str = delimiter - self.encoding: str = encoding - self.html_fields: str = html_fields or '' - self.handle_invisible_table: str = handle_invisible_table or 'false' + self.document_type: str = document_type or "" + self.structure_type: str = structure_type or "tree" + self.return_format: str = return_format or "json" + + self.with_attachments: str = with_attachments or "false" + self.need_content_analysis: str = need_content_analysis or "false" + self.recursion_deep_attachments: str = recursion_deep_attachments or "10" + self.return_base64: str = return_base64 or "false" + self.attachments_dir: str = attachments_dir + + self.insert_table: str = insert_table or "false" + self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true" + self.table_type: str = table_type or "" + self.orient_analysis_cells: str = orient_analysis_cells or "false" + self.orient_cell_angle: str = orient_cell_angle or "90" + + self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby" + self.language: str = language or "rus+eng" + self.pages: str = pages or ":" + self.is_one_column_document: str = is_one_column_document or "auto" + self.document_orientation: str = document_orientation or "auto" + self.need_header_footer_analysis: str = need_header_footer_analysis or "false" + self.need_binarization: str = need_binarization or "false" + + self.delimiter: str = delimiter + self.encoding: str = encoding + self.html_fields: str = html_fields or "" + self.handle_invisible_table: str = handle_invisible_table or "false" diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index 1f289ca7..bd765535 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Dict, Iterator, Set +from typing import Dict, Iterator, List, Optional, Set from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation @@ -24,7 +24,7 @@ def __prettify_text(text: str) -> Iterator[str]: yield " ".join(res) -def 
_node2tree(paragraph: 'TreeNode', depth: int, depths: Set[int] = None) -> str: +def _node2tree(paragraph: TreeNode, depth: int, depths: Set[int] = None) -> str: if depths is None: depths = set() @@ -33,31 +33,29 @@ def _node2tree(paragraph: 'TreeNode', depth: int, depths: Set[int] = None) -> st space = "".join(space) node_result = [] - node_result.append(" {} {} ".format( - space, paragraph.metadata.hierarchy_level.line_type + " " + paragraph.node_id)) + node_result.append(f" {space} {paragraph.metadata.hierarchy_level.line_type} {paragraph.node_id} ") for text in __prettify_text(paragraph.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = "".join(space) - node_result.append("
<p> {} {} </p> ".format(space, text)) + node_result.append(f"<p> {space} {text} </p>") if len(paragraph.subparagraphs) > 0: - sub_nodes = "\n".join([_node2tree(sub_node, depth=depth + 1, depths=depths.union({depth})) - for sub_node in paragraph.subparagraphs]) - return """ + sub_nodes = "\n".join([_node2tree(sub_node, depth=depth + 1, depths=depths.union({depth})) for sub_node in paragraph.subparagraphs]) + return f""" - <details> <summary> {} </summary> - {} + <details> <summary> {"".join(node_result)} </summary> + {sub_nodes} </details> - """.format("".join(node_result), sub_nodes) + """ else: - return """ + return f""" - <p> {} </p> + <p> {"".join(node_result)} </p> - """.format("".join(node_result)) + """ -def json2collapsed_tree(paragraph: 'TreeNode') -> str: - result = """ +def json2collapsed_tree(paragraph: TreeNode) -> str: + result = f""" @@ -66,15 +64,15 @@ def json2collapsed_tree(paragraph: 'TreeNode') -> str: - {} + {_node2tree(paragraph, depth=0)} - """.format(_node2tree(paragraph, depth=0)) + """ return result -def json2tree(paragraph: 'TreeNode') -> str: +def json2tree(paragraph: TreeNode) -> str: stack = [paragraph] nodes = [] while len(stack) > 0: @@ -94,14 +92,13 @@ def json2tree(paragraph: 'TreeNode') -> str: depths = {d for d in depths if d <= depth} space = [space_symbol] * 4 * (depth - 1) + 4 * ["-"] space = __add_vertical_line(depths, space) - node_result.append("
<p> {} {} </p> ".format( - space, node.metadata.hierarchy_level.line_type + " " + node.node_id)) + node_result.append(f"<p> {space} {node.metadata.hierarchy_level.line_type} {node.node_id} </p>") for text in __prettify_text(node.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = __add_vertical_line(depths, space) - node_result.append("<p> {} {} </p> ".format(space, text)) + node_result.append(f"<p> {space} {text} </p>") result.extend(reversed(node_result)) - result.append("<p> {} </p> ".format(root.text)) + result.append(f"<p> {root.text} </p>") return "".join(reversed(result)) @@ -111,11 +108,7 @@ def __add_vertical_line(depths: Set[int], space: List[str]) -> str: return "".join(space) -def json2html(text: str, - paragraph: 'TreeNode', - tables: Optional[List[Table]], - tabs: int = 0, - table2id: Dict[str, int] = None) -> str: +def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tabs: int = 0, table2id: Dict[str, int] = None) -> str: if tables is None: tables = [] @@ -125,18 +118,13 @@ def json2html(text: str, ptext = __annotations2html(paragraph, table2id) if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]: - ptext = "<b>{}</b>".format(ptext.strip()) + ptext = f"<b>{ptext.strip()}</b>" elif paragraph.metadata.hierarchy_level.line_type == HierarchyLevel.list_item: - ptext = "<i>{}</i>".format(ptext.strip()) + ptext = f"<i>{ptext.strip()}</i>" else: ptext = ptext.strip() - text += "
<p> {tab} {text} id = {id} ; type = {type} </p> ".format( - tab=" " * tabs, - text=ptext, - type=str(paragraph.metadata.hierarchy_level.line_type), - id=paragraph.node_id - ) + text += f'<p> {" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type} </p>' for subparagraph in paragraph.subparagraphs: text = json2html(text=text, paragraph=subparagraph, tables=None, tabs=tabs + 4, table2id=table2id) @@ -175,7 +163,7 @@ def __value2tag(name: str, value: str) -> str: return value -def __annotations2html(paragraph: 'TreeNode', table2id: Dict[str, int]) -> str: +def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str: indexes = dict() for annotation in paragraph.annotations: @@ -198,8 +186,7 @@ def __annotations2html(paragraph: 'TreeNode', table2id: Dict[str, int]) -> str: indexes.setdefault(annotation.start, "") indexes.setdefault(annotation.end, "") if name == "table": - indexes[annotation.start] += '
<a href="#{uid}"> table#{index_table} </a>' \ - .format(uid=tag, index_table=table2id[tag]) + indexes[annotation.start] += f'<a href="#{tag}"> table#{table2id[tag]} </a>' else: indexes[annotation.start] += "<" + tag + ">" indexes[annotation.end] = "</" + tag + ">" + indexes[annotation.end] @@ -215,10 +202,8 @@ def __annotations2html(paragraph: 'TreeNode', table2id: Dict[str, int]) -> str: def __table2html(table: Table, table2id: Dict[str, int]) -> str: uid = table.metadata.uid - text = "
<p> table {}: </p> ".format(table2id[uid]) - text += '<table id="{uid}">\n\n'.format( - uid=uid - ) + text = f"<p> table {table2id[uid]}: </p> " + text += f'<table id="{uid}">\n\n' cell_properties = table.metadata.cell_properties for row_id, row in enumerate(table.cells): text += "<tr>\n" @@ -227,10 +212,10 @@ def __table2html(table: Table, table2id: Dict[str, int]) -> str: for col_id, cell in enumerate(row): text += "<td" if cell_properties: prop = cell_properties[row_id][col_id] if prop.invisible: - text += " style=\"display: none\" " - text += " colspan=\"{}\" rowspan=\"{}\">{}</td>\n".format(prop.colspan, prop.rowspan, cell) + text += ' style="display: none" ' + text += f' colspan="{prop.colspan}" rowspan="{prop.rowspan}">{cell}</td>\n' else: text += f">{cell}</td>\n" text += "</tr>\n" - text += '\n
' + text += "\n" return text diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index ad62a2fa..370a3cc1 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -3,16 +3,16 @@ import tempfile import uvicorn -from fastapi import Response, FastAPI, Request, Depends, UploadFile, File -from fastapi.responses import UJSONResponse, ORJSONResponse +from fastapi import Depends, FastAPI, File, Request, Response, UploadFile +from fastapi.responses import ORJSONResponse, UJSONResponse from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse import dedoc from dedoc.api.api_args import QueryParameters -from dedoc.api.api_utils import json2html, json2tree, json2collapsed_tree -from dedoc.common.exceptions.dedoc_exception import DedocException -from dedoc.common.exceptions.missing_file_exception import MissingFileException +from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree +from dedoc.common.exceptions.dedoc_error import DedocError +from dedoc.common.exceptions.missing_file_error import MissingFileError from dedoc.config import get_config from dedoc.dedoc_manager import DedocManager from dedoc.utils.utils import save_upload_file @@ -23,9 +23,9 @@ static_files_dirs = config.get("static_files_dirs") app = FastAPI() -app.mount('/static', StaticFiles(directory=config.get("static_path", static_path)), name="static") +app.mount("/static", StaticFiles(directory=config.get("static_path", static_path)), name="static") -module_api_args = importlib.import_module(config['import_path_init_api_args']) +module_api_args = importlib.import_module(config["import_path_init_api_args"]) logger = config["logger"] manager = DedocManager(config=config) @@ -33,13 +33,13 @@ @app.get("/") def get_info() -> Response: """ - Root URL '/' is need start with simple Flask before rest-plus. API otherwise you will get 404 Error. + Root URL "/" is need start with simple Flask before rest-plus. API otherwise you will get 404 Error. It is bug of rest-plus lib. 
""" - return FileResponse(os.path.join(static_path, 'html_eng/info.html')) + return FileResponse(os.path.join(static_path, "html_eng/info.html")) -@app.get('/static_file') +@app.get("/static_file") def get_static_file(request: Request) -> Response: path = _get_static_file_path(request) # TODO check as_attachment @@ -47,7 +47,7 @@ def get_static_file(request: Request) -> Response: return FileResponse(path) -@app.get('/version') +@app.get("/version") def get_version() -> Response: return PlainTextResponse(dedoc.__version__) @@ -55,17 +55,16 @@ def get_version() -> Response: def _get_static_file_path(request: Request) -> str: file = request.query_params.get("fname") directory_name = request.query_params.get("directory") - directory = static_files_dirs[ - directory_name] if directory_name is not None and directory_name in static_files_dirs else static_path + directory = static_files_dirs[directory_name] if directory_name is not None and directory_name in static_files_dirs else static_path return os.path.abspath(os.path.join(directory, file)) -@app.post('/upload') -async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: +@app.post("/upload") +async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa parameters = query_params.dict(by_alias=True) if not file or file.filename == "": - raise MissingFileException("Error: Missing content in request_post file parameter", version=dedoc.__version__) + raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__) # check if the post request_post has the file part logger.info(f"Get file {file.filename} with parameters {parameters}") @@ -90,8 +89,8 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D return ORJSONResponse(content=document_tree.to_dict(), status_code=200) -@app.exception_handler(DedocException) -async def exception_handler(request: Request, exc: DedocException) -> Response: +@app.exception_handler(DedocError) +async def exception_handler(request: Request, exc: DedocError) -> Response: result = {"message": exc.msg} if exc.filename: result["file_name"] = exc.filename diff --git a/dedoc/api/models/custom_fields.py b/dedoc/api/models/custom_fields.py index 36b37e49..b3796873 100644 --- a/dedoc/api/models/custom_fields.py +++ b/dedoc/api/models/custom_fields.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Optional +from typing import Optional, TypeVar from weakref import WeakSet from flask_restx import fields @@ -7,17 +7,17 @@ class AnyNotNullField(fields.Raw): - __schema_type__ = 'any' + __schema_type__ = "any" - def format(self, value: T) -> Optional[T]: # NOQA + def format(self, value: T) -> Optional[T]: # noqa if not isinstance(value, WeakSet): return value class ForbiddenField(fields.Raw): - __schema_type__ = 'any' + __schema_type__ = "any" - def format(self, value: T) -> None: # NOQA + def format(self, value: T) -> None: # noqa return diff --git a/dedoc/api/train_dataset/api_args.py b/dedoc/api/train_dataset/api_args.py index 4a4593fa..53eb0d2c 100644 --- a/dedoc/api/train_dataset/api_args.py +++ b/dedoc/api/train_dataset/api_args.py @@ -1,4 +1,4 @@ -from typing import Any, Optional +from typing import Optional from fastapi import Body @@ -10,21 +10,21 @@ class TrainDatasetParameters(QueryParameters): task_size: Optional[str] def __init__(self, - type_of_task: Optional[str] = Body(description="Type of the task to create", default=None), - task_size: Optional[str] = 
Body(description="Maximum number of images in one task", default=None), + type_of_task: Optional[str] = Body(description="Type of the task to create", default=None), # noqa + task_size: Optional[str] = Body(description="Maximum number of images in one task", default=None), # noqa - document_type: Optional[str] = Body(default=None), - pdf_with_text_layer: Optional[str] = Body(default=None), - language: Optional[str] = Body(default=None), - need_header_footer_analysis: Optional[str] = Body(default=None), + document_type: Optional[str] = Body(default=None), # noqa + pdf_with_text_layer: Optional[str] = Body(default=None), # noqa + language: Optional[str] = Body(default=None), # noqa + need_header_footer_analysis: Optional[str] = Body(default=None), # noqa - **data: Any) -> None: + **data: dict) -> None: super().__init__(**data) self.type_of_task: str = type_of_task or "" self.task_size: str = task_size or "250" self.document_type = document_type or "" - self.pdf_with_text_layer = pdf_with_text_layer or 'auto' + self.pdf_with_text_layer = pdf_with_text_layer or "auto" self.language = language or "rus+eng" - self.need_header_footer_analysis = need_header_footer_analysis or 'false' + self.need_header_footer_analysis = need_header_footer_analysis or "false" diff --git a/dedoc/api/train_dataset/api_collect_train_dataset.py b/dedoc/api/train_dataset/api_collect_train_dataset.py index 96d44848..593cb005 100644 --- a/dedoc/api/train_dataset/api_collect_train_dataset.py +++ b/dedoc/api/train_dataset/api_collect_train_dataset.py @@ -3,7 +3,7 @@ import shutil import uvicorn -from fastapi import FastAPI, Response, Request, UploadFile, File, Depends +from fastapi import Depends, FastAPI, File, Request, Response, UploadFile from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, HTMLResponse from starlette.templating import Jinja2Templates @@ -30,7 +30,7 @@ logger = config.get("logger", logging.getLogger()) app = FastAPI() -app.mount('/static', StaticFiles(directory=static_path), name="static") +app.mount("/static", StaticFiles(directory=static_path), name="static") templates = Jinja2Templates(directory=os.path.join(static_path, "train_dataset")) manager = DedocManager(config=config) @@ -84,7 +84,7 @@ config_path=os.path.join(train_resources_path, "law", "config.json"), tmp_dir=UPLOAD_FOLDER, progress_bar=progress_bar, - item2label=lambda t: label2label_law.get(t['_metadata']['hierarchy_level']['line_type'], "raw_text"), + item2label=lambda t: label2label_law.get(t["_metadata"]["hierarchy_level"]["line_type"], "raw_text"), config=config), "paragraph_classifier": LineLabelTasker( path2bboxes=boxes_path, @@ -104,7 +104,7 @@ config_path=os.path.join(train_resources_path, "tz", "config.json"), tmp_dir=UPLOAD_FOLDER, progress_bar=progress_bar, - item2label=lambda t: label2label_tz.get(t['_metadata']['hierarchy_level']['line_type'], "raw_text"), + item2label=lambda t: label2label_tz.get(t["_metadata"]["hierarchy_level"]["line_type"], "raw_text"), config=config), "diploma_classifier": FilteredLineLabelTasker( path2bboxes=boxes_path, @@ -114,7 +114,7 @@ config_path=os.path.join(train_resources_path, "diploma", "config.json"), tmp_dir=UPLOAD_FOLDER, progress_bar=progress_bar, - item2label=lambda t: label2label_diploma.get(t['_metadata']['hierarchy_level']['line_type'], "raw_text"), + item2label=lambda t: label2label_diploma.get(t["_metadata"]["hierarchy_level"]["line_type"], "raw_text"), config=config), "header_classifier": HeaderFooterTasker( path2bboxes=boxes_path, @@ -139,12 
+139,12 @@ handler = AsyncHandler(tasker=tasker, manager=manager, config=config) -@app.get('/') +@app.get("/") def get_info() -> Response: """ Returns the main page for the labeling mode. """ - return FileResponse(os.path.join(static_path, 'train_dataset/info_labeling_mode.html')) + return FileResponse(os.path.join(static_path, "train_dataset/info_labeling_mode.html")) @app.get("/handle_archive") @@ -152,11 +152,11 @@ def handle_archive() -> Response: """ Returns the page for running the whole pipeline of task making. """ - return FileResponse(os.path.join(static_path, 'train_dataset/form_input_archive.html')) + return FileResponse(os.path.join(static_path, "train_dataset/form_input_archive.html")) -@app.post('/upload_archive') -def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParameters = Depends()) -> Response: +@app.post("/upload_archive") +def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParameters = Depends()) -> Response: # noqa """ Run the whole pipeline of task making. """ @@ -169,7 +169,7 @@ def upload_archive(file: UploadFile = File(...), query_params: TrainDatasetParam ) -@app.get('/get_result_archive') +@app.get("/get_result_archive") def get_result_archive(request: Request, uid: str) -> Response: """ Get the archive with the result tasks. @@ -180,33 +180,33 @@ def get_result_archive(request: Request, uid: str) -> Response: path_out = os.path.join(UPLOAD_FOLDER, file) shutil.move(handler.get_results(uid), path_out) hash_sum = calculate_file_hash(path=path_out) - logger.info("md5sum {}".format(hash_sum)) + logger.info(f"md5sum {hash_sum}") return templates.TemplateResponse("download.html", dict(request=request, value=file, cnt_per_one=1, hash_sum=hash_sum, filename=file)) else: response = "
<p> Ещё не готово </p>" for line in handler.get_progress(uid).split("\n"): - response += "<p> {} </p>".format(line) + response += f"<p> {line} </p>
" return HTMLResponse(response, status_code=202) -@app.get('/info_classifiers') +@app.get("/info_classifiers") def get_classifiers_info() -> Response: return FileResponse(os.path.join(static_path, "train_dataset/refit_classifier.html")) -@app.get('/static_file') +@app.get("/static_file") def get_static_file(request: Request) -> Response: path = _get_static_file_path(request) return FileResponse(path) -@app.get('/return-file/{filename}') +@app.get("/return-file/{filename}") def return_files(filename: str) -> Response: file_path = os.path.join(UPLOAD_FOLDER, filename) return FileResponse(file_path) -@app.get('/clear') +@app.get("/clear") def clear() -> Response: shutil.rmtree(config["intermediate_data_path"]) os.makedirs(config["intermediate_data_path"]) diff --git a/dedoc/api/train_dataset/async_archive_handler.py b/dedoc/api/train_dataset/async_archive_handler.py index 43b63b1a..09b6d39d 100644 --- a/dedoc/api/train_dataset/async_archive_handler.py +++ b/dedoc/api/train_dataset/async_archive_handler.py @@ -10,7 +10,7 @@ from fastapi import UploadFile -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.dedoc_manager import DedocManager from dedoc.train_dataset.taskers.tasker import Tasker @@ -35,19 +35,15 @@ def run(self) -> None: else: uid, parameters, file = self.queue.get() self.results[uid] = self._handle_archive(path=file, parameters=parameters, uid=uid) - self.logger.info("FINISH {}".format(uid)) + self.logger.info(f"FINISH {uid}") def _handle_archive(self, uid: str, path: str, parameters: dict) -> str: try: with zipfile.ZipFile(path, "r") as archive: for i, file in enumerate(archive.namelist()): - self.progress[uid] = "files done\t= {} \n files_in_progress\t= {}\n total\t= {}".format( - i, 1, len(archive.namelist()) - ) + self.progress[uid] = f"files done\t= {i} \n files_in_progress\t= {1}\n total\t= {len(archive.namelist())}" self.__handle_one_file(archive, file, parameters) - self.progress[uid] = "files done\t= {} \n files_in_progress\t= {}\n total\t= {}".format( - i + 1, 0, len(archive.namelist()) - ) + self.progress[uid] = f"files done\t= {i + 1} \n files_in_progress\t= {0}\n total\t= {len(archive.namelist())}" task, _ = self.tasker.create_tasks( type_of_task=parameters["type_of_task"], @@ -56,11 +52,11 @@ def _handle_archive(self, uid: str, path: str, parameters: dict) -> str: ) return task except Exception as e: - self.progress[uid] = "Fail with\n {}".format(e) + self.progress[uid] = f"Fail with\n{e}" raise e def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dict) -> None: - self.logger.info("Start handle {}".format(file)) + self.logger.info(f"Start handle {file}") with TemporaryDirectory() as tmpdir: try: with archive.open(file) as item: @@ -71,9 +67,9 @@ def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dic with open(path_out, "wb") as file_out: file_out.write(item.read()) self.manager.parse(file_path=path_out, parameters=parameters) - except BadFileFormatException as e: - self.logger.warning("Can't handle file {}, exception {}".format(file, str(e))) - self.logger.info("Finish handle {}".format(file)) + except BadFileFormatError as e: + self.logger.warning(f"Can't handle file {file}, exception {str(e)}") + self.logger.info(f"Finish handle {file}") class AsyncHandler: diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py index 
2c5d5643..55fb06e9 100644 --- a/dedoc/attachments_extractors/abstract_attachment_extractor.py +++ b/dedoc/attachments_extractors/abstract_attachment_extractor.py @@ -1,7 +1,7 @@ import os import uuid from abc import ABC, abstractmethod -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from dedoc.data_structures.attached_file import AttachedFile from dedoc.utils.utils import save_data_to_unique_file @@ -48,10 +48,7 @@ def with_attachments(parameters: dict) -> bool: """ return str(parameters.get("with_attachments", "false")).lower() == "true" - def _content2attach_file(self, - content: List[Tuple[str, bytes]], - tmpdir: str, - need_content_analysis: bool) -> List[AttachedFile]: + def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool) -> List[AttachedFile]: attachments = [] for original_name, contents in content: tmp_file_name = save_data_to_unique_file(directory=tmpdir, filename=original_name, binary_data=contents) diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py index 171a233c..15f4adcb 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py @@ -63,7 +63,7 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]: def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachments_dir: str) -> List[AttachedFile]: result = [] - with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile: + with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: files = zfile.namelist() attachments = [file for file in files if file.startswith((f"{attachments_dir}/media/", f"{attachments_dir}/embeddings/"))] @@ -71,10 +71,10 @@ def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachm original_name = os.path.split(attachment)[-1] # these are windows metafile extensions - if original_name.endswith(('.emf', 'wmf')): + if original_name.endswith((".emf", "wmf")): continue - if not original_name.endswith('.bin'): + if not original_name.endswith(".bin"): result.append((original_name, zfile.read(attachment))) else: with zfile.open(attachment) as f: @@ -82,9 +82,9 @@ def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachm # extracting PDF-files if ole.exists("CONTENTS"): - data = ole.openstream('CONTENTS').read() - if data[0:5] == b'%PDF-': - result.append((os.path.splitext(original_name)[-2] + '.pdf', data)) + data = ole.openstream("CONTENTS").read() + if data[0:5] == b"%PDF-": + result.append((f"{os.path.splitext(original_name)[-2]}.pdf", data)) # extracting files in other formats elif ole.exists("\x01Ole10Native"): diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py index 93959faf..5c9be9c9 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py @@ -8,7 +8,7 @@ from bs4 import BeautifulSoup, Tag from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import 
AbstractOfficeAttachmentsExtractor -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes @@ -32,7 +32,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ """ result = [] try: - with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile: + with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile: diagram_attachments = self.__extract_diagrams(zfile) need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis) @@ -40,7 +40,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word") except zipfile.BadZipFile: - raise BadFileFormatException("Bad docx file:\n file_name = {}. Seems docx is broken".format(filename)) + raise BadFileFormatError(f"Bad docx file:\n file_name = {filename}. Seems docx is broken") return result def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: @@ -52,12 +52,12 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: """ result = [] try: - content = document.read('word/document.xml') + content = document.read("word/document.xml") except KeyError: - content = document.read('word/document2.xml') + content = document.read("word/document2.xml") content = re.sub(br"\n[\t ]*", b"", content) - bs = BeautifulSoup(content, 'xml') + bs = BeautifulSoup(content, "xml") paragraphs = [p for p in bs.body] diagram_paragraphs = [] @@ -81,10 +81,10 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]: paragraph = p.extract() uid = hashlib.md5(paragraph.encode()).hexdigest() - with open(f'{tmpdir}/word/document.xml', 'w') as f: + with open(f"{tmpdir}/word/document.xml", "w") as f: f.write(doc_text) diagram_name = f"{uid}.docx" - with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode='w') as new_d: + with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d: for filename in namelist: new_d.write(os.path.join(tmpdir, filename), arcname=filename) with open(os.path.join(tmpdir, diagram_name), "rb") as f: diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py index ff9507c4..bbcf1953 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py @@ -25,7 +25,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ """ attachments = [] name, ext = splitext_(filename) - if ext.lower() != '.xlsx': + if ext.lower() != ".xlsx": return attachments return self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="xl") diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py index bd6767e7..706e34c4 100644 --- 
a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py @@ -1,5 +1,5 @@ -import os import json +import os from typing import List, Optional from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor @@ -42,17 +42,17 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ for keys in field_keys: path = json.dumps(keys, ensure_ascii=False) - attached_filename = path + '.html' + attached_filename = f"{path}.html" attachment_file_path = os.path.join(tmpdir, attached_filename) field_content = self.__get_value_by_keys(data, keys) if not isinstance(field_content, str): continue - with open(attachment_file_path, 'w') as f: + with open(attachment_file_path, "w") as f: f.write(field_content) - with open(attachment_file_path, mode='rb') as f: + with open(attachment_file_path, mode="rb") as f: binary_data = f.read() attachments.append((attached_filename, binary_data)) diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py index 35127a4f..9cc35b6e 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py @@ -1,17 +1,17 @@ import logging import os import uuid -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import PyPDF2 from PyPDF2.pdf import PageObject from PyPDF2.utils import PdfReadError from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor +from dedoc.attachments_extractors.utils import create_note from dedoc.data_structures.attached_file import AttachedFile from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.utils.utils import convert_datetime -from dedoc.attachments_extractors.utils import create_note class PDFAttachmentsExtractor(AbstractAttachmentsExtractor): @@ -38,7 +38,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \ the methods' parameters. """ - with open(os.path.join(tmpdir, filename), 'rb') as handler: + with open(os.path.join(tmpdir, filename), "rb") as handler: try: reader = PyPDF2.PdfFileReader(handler) except Exception as e: @@ -59,25 +59,22 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]: attachments = [] - if '/Annots' in page.keys(): - for annot in page['/Annots']: + if "/Annots" in page.keys(): + for annot in page["/Annots"]: # Other subtypes, such as /Link, cause errors - subtype = annot.getObject().get('/Subtype') + subtype = annot.getObject().get("/Subtype") if subtype == "/FileAttachment": - name = annot.getObject()['/FS']['/UF'] - data = annot.getObject()['/FS']['/EF']['/F'].getData() # The file containing the stream data. + name = annot.getObject()["/FS"]["/UF"] + data = annot.getObject()["/FS"]["/EF"]["/F"].getData() # The file containing the stream data. 
attachments.append([name, data]) - if subtype == "/Text" and annot.getObject().get('/Name') == '/Comment': # it is messages (notes) in PDF + if subtype == "/Text" and annot.getObject().get("/Name") == "/Comment": # it is messages (notes) in PDF note = annot.getObject() created_time = convert_datetime(note["/CreationDate"]) if "/CreationDate" in note else None modified_time = convert_datetime(note["/M"]) if "/M" in note else None user = note.get("/T") data = note.get("/Contents", "") - name, content = create_note(content=data, - modified_time=modified_time, - created_time=created_time, - author=user) + name, content = create_note(content=data, modified_time=modified_time, created_time=created_time, author=user) attachments.append((name, bytes(content))) return attachments @@ -99,15 +96,15 @@ def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str """ attachments = [] catalog = reader.trailer["/Root"] - if '/Names' in catalog.keys() and '/EmbeddedFiles' in catalog['/Names'].keys() and '/Names' in catalog['/Names']['/EmbeddedFiles'].keys(): - file_names = catalog['/Names']['/EmbeddedFiles']['/Names'] + if "/Names" in catalog.keys() and "/EmbeddedFiles" in catalog["/Names"].keys() and "/Names" in catalog["/Names"]["/EmbeddedFiles"].keys(): + file_names = catalog["/Names"]["/EmbeddedFiles"]["/Names"] for f in file_names: if isinstance(f, str): data_index = file_names.index(f) + 1 dict_object = file_names[data_index].getObject() - if '/EF' in dict_object and '/F' in dict_object['/EF']: - data = dict_object['/EF']['/F'].getData() - name = dict_object.get('/UF', "pdf_attach_{}".format(uuid.uuid1())) + if "/EF" in dict_object and "/F" in dict_object["/EF"]: + data = dict_object["/EF"]["/F"].getData() + name = dict_object.get("/UF", f"pdf_attach_{uuid.uuid1()}") attachments.append((name, data)) return attachments diff --git a/dedoc/attachments_extractors/utils.py b/dedoc/attachments_extractors/utils.py index acc64fff..679677e9 100644 --- a/dedoc/attachments_extractors/utils.py +++ b/dedoc/attachments_extractors/utils.py @@ -10,6 +10,6 @@ def create_note(content: str, modified_time: int, created_time: int, author: str "created_time": created_time, "size": size if size else len(content), "author": author} - encode_data = json.dumps(note_dict).encode('utf-8') + encode_data = json.dumps(note_dict).encode("utf-8") return filename, encode_data diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index 8dcfce0a..1392277a 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -7,8 +7,8 @@ from typing import List from dedoc.attachments_extractors import AbstractAttachmentsExtractor -from dedoc.common.exceptions.dedoc_exception import DedocException -from dedoc.data_structures import ParsedDocument, DocumentMetadata, AttachedFile +from dedoc.common.exceptions.dedoc_error import DedocError +from dedoc.data_structures import AttachedFile, DocumentMetadata, ParsedDocument from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.utils.utils import get_empty_content @@ -73,7 +73,7 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy) else: parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) - except DedocException: + except DedocError: # return empty 
ParsedDocument with Meta information parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) diff --git a/dedoc/common/exceptions/bad_file_error.py b/dedoc/common/exceptions/bad_file_error.py new file mode 100644 index 00000000..4b800c9d --- /dev/null +++ b/dedoc/common/exceptions/bad_file_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class BadFileFormatError(DedocError): + """ + Raise if given file can't be handled by the system (for example if no reader can read this file) + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(BadFileFormatError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"BadFileFormatError({self.msg})" + + @property + def code(self) -> int: + return 415 diff --git a/dedoc/common/exceptions/bad_file_exception.py b/dedoc/common/exceptions/bad_file_exception.py deleted file mode 100644 index 0aeea0e1..00000000 --- a/dedoc/common/exceptions/bad_file_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class BadFileFormatException(DedocException): - """ - Raise if given file can't be handled by the system (for example if no reader can read this file) - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(BadFileFormatException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "BadFileException({})".format(self.msg) - - @property - def code(self) -> int: - return 415 diff --git a/dedoc/common/exceptions/bad_parameters_error.py b/dedoc/common/exceptions/bad_parameters_error.py new file mode 100644 index 00000000..dc8c0aa9 --- /dev/null +++ b/dedoc/common/exceptions/bad_parameters_error.py @@ -0,0 +1,20 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class BadParametersError(DedocError): + """ + Raise if given parameters are incorrect and can't be handled by the system + (for example if string provided instead of bool) + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(BadParametersError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"BadParametersError({self.msg})" + + @property + def code(self) -> int: + return 400 diff --git a/dedoc/common/exceptions/bad_parameters_exception.py b/dedoc/common/exceptions/bad_parameters_exception.py deleted file mode 100644 index 626d5d82..00000000 --- a/dedoc/common/exceptions/bad_parameters_exception.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class BadParametersException(DedocException): - """ - Raise if given parameters are incorrect and can't be handled by the system - (for example if string provided instead of bool) - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(BadParametersException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, 
version=version) - - def __str__(self) -> str: - return "BadParametersException({})".format(self.msg) - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/conversion_error.py b/dedoc/common/exceptions/conversion_error.py new file mode 100644 index 00000000..f95207b3 --- /dev/null +++ b/dedoc/common/exceptions/conversion_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class ConversionError(DedocError): + """ + Can be raised if conversion of the file ended unsuccessfully or didn't finish at all (converter terminated the process) + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(ConversionError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"ConversionError({self.msg})" + + @property + def code(self) -> int: + return 415 diff --git a/dedoc/common/exceptions/conversion_exception.py b/dedoc/common/exceptions/conversion_exception.py deleted file mode 100644 index b71b9f5a..00000000 --- a/dedoc/common/exceptions/conversion_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class ConversionException(DedocException): - """ - Can be raised if conversion of the file ended unsuccessfully or didn't finish at all (converter terminated the process) - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(ConversionException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return f"ConversionException({self.msg})" - - @property - def code(self) -> int: - return 415 diff --git a/dedoc/common/exceptions/dedoc_exception.py b/dedoc/common/exceptions/dedoc_error.py similarity index 81% rename from dedoc/common/exceptions/dedoc_exception.py rename to dedoc/common/exceptions/dedoc_error.py index 1b9cbf5d..78426e39 100644 --- a/dedoc/common/exceptions/dedoc_exception.py +++ b/dedoc/common/exceptions/dedoc_error.py @@ -3,14 +3,14 @@ import dedoc -class DedocException(Exception): +class DedocError(Exception): def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None, metadata: Optional[dict] = None) -> None: - super(DedocException, self).__init__() + super(DedocError, self).__init__() self.msg = msg self.msg_api = msg if msg_api is None else msg_api self.filename = filename @@ -18,7 +18,7 @@ def __init__(self, self.metadata = metadata def __str__(self) -> str: - return "MissingFileException({})".format(self.msg) + return f"DedocError({self.msg})" @property def code(self) -> int: diff --git a/dedoc/common/exceptions/java_not_found_error.py b/dedoc/common/exceptions/java_not_found_error.py index 62b426fb..c6d96384 100644 --- a/dedoc/common/exceptions/java_not_found_error.py +++ b/dedoc/common/exceptions/java_not_found_error.py @@ -1,21 +1,18 @@ from typing import Optional -from dedoc.common.exceptions.dedoc_exception import DedocException +from dedoc.common.exceptions.dedoc_error import DedocError -class JavaNotFoundError(DedocException): + +class JavaNotFoundError(DedocError): """ - raise if there is no JAVA + Raise if there is no JAVA """ - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: 
Optional[str] = None, - version: Optional[str] = None) -> None: + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: super(JavaNotFoundError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) def __str__(self) -> str: - return "JavaNotFoundError({})".format(self.msg) + return f"JavaNotFoundError({self.msg})" @property def code(self) -> int: diff --git a/dedoc/common/exceptions/minio_error.py b/dedoc/common/exceptions/minio_error.py new file mode 100644 index 00000000..6d43c64f --- /dev/null +++ b/dedoc/common/exceptions/minio_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class MinioError(DedocError): + """ + Raise if there is no file in minio + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(MinioError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"MinioError({self.msg})" + + @property + def code(self) -> int: + return 404 diff --git a/dedoc/common/exceptions/minio_exception.py b/dedoc/common/exceptions/minio_exception.py deleted file mode 100644 index a19ae189..00000000 --- a/dedoc/common/exceptions/minio_exception.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class MinioException(DedocException): - """ - raise if there is no file in minio - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(MinioException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "MinioException({})".format(self.msg) - - @property - def code(self) -> int: - return 404 diff --git a/dedoc/common/exceptions/missing_file_error.py b/dedoc/common/exceptions/missing_file_error.py new file mode 100644 index 00000000..7bc861e9 --- /dev/null +++ b/dedoc/common/exceptions/missing_file_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class MissingFileError(DedocError): + """ + Raise if there is no file in post request + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(MissingFileError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"MissingFileError({self.msg})" + + @property + def code(self) -> int: + return 400 diff --git a/dedoc/common/exceptions/missing_file_exception.py b/dedoc/common/exceptions/missing_file_exception.py deleted file mode 100644 index f6b9d654..00000000 --- a/dedoc/common/exceptions/missing_file_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class MissingFileException(DedocException): - """ - raise if there is no file in post request - """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(MissingFileException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return 
"MissingFileException({})".format(self.msg) - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/recognize_error.py b/dedoc/common/exceptions/recognize_error.py new file mode 100644 index 00000000..05c388ce --- /dev/null +++ b/dedoc/common/exceptions/recognize_error.py @@ -0,0 +1,16 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class RecognizeError(DedocError): + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(RecognizeError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"RecognizeError({self.msg})" + + @property + def code(self) -> int: + return 500 diff --git a/dedoc/common/exceptions/recognize_exception.py b/dedoc/common/exceptions/recognize_exception.py deleted file mode 100644 index 8e62147e..00000000 --- a/dedoc/common/exceptions/recognize_exception.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class RecognizeException(DedocException): - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(RecognizeException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "RecognizeException({})".format(self.msg) - - @property - def code(self) -> int: - return 500 diff --git a/dedoc/common/exceptions/structure_extractor_error.py b/dedoc/common/exceptions/structure_extractor_error.py new file mode 100644 index 00000000..1bb9bd00 --- /dev/null +++ b/dedoc/common/exceptions/structure_extractor_error.py @@ -0,0 +1,19 @@ +from typing import Optional + +from dedoc.common.exceptions.dedoc_error import DedocError + + +class StructureExtractorError(DedocError): + """ + Raise if structure extractor can't build structured document from unstructured one. + """ + + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: + super(StructureExtractorError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + + def __str__(self) -> str: + return f"StructureExtractorError({self.msg})" + + @property + def code(self) -> int: + return 400 diff --git a/dedoc/common/exceptions/structure_extractor_exception.py b/dedoc/common/exceptions/structure_extractor_exception.py deleted file mode 100644 index 76d738a4..00000000 --- a/dedoc/common/exceptions/structure_extractor_exception.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import Optional - -from dedoc.common.exceptions.dedoc_exception import DedocException - - -class StructureExtractorException(DedocException): - """ - Raise if structure extractor can't build structured document from unstructured one. 
- """ - - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: - super(StructureExtractorException, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return "StructureExtractorException({})".format(self.msg) - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/tabby_pdf_error.py b/dedoc/common/exceptions/tabby_pdf_error.py index 1dbb008c..eff2ec8d 100644 --- a/dedoc/common/exceptions/tabby_pdf_error.py +++ b/dedoc/common/exceptions/tabby_pdf_error.py @@ -1,21 +1,18 @@ from typing import Optional -from dedoc.common.exceptions.dedoc_exception import DedocException +from dedoc.common.exceptions.dedoc_error import DedocError -class TabbyPdfError(DedocException): + +class TabbyPdfError(DedocError): """ Error from TabbyPDF """ - def __init__(self, - msg: str, - msg_api: Optional[str] = None, - filename: Optional[str] = None, - version: Optional[str] = None) -> None: + def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: super(TabbyPdfError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) def __str__(self) -> str: - return "TabbyPdfError({})".format(self.msg) + return f"TabbyPdfError({self.msg})" @property def code(self) -> int: diff --git a/dedoc/config.py b/dedoc/config.py index 2664e7dd..34f51297 100644 --- a/dedoc/config.py +++ b/dedoc/config.py @@ -2,14 +2,14 @@ import logging import os import sys -from typing import Optional, Any +from typing import Any, Optional logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s") DEBUG_MODE = False -RESOURCES_PATH = os.environ.get('RESOURCES_PATH', os.path.join(os.path.expanduser('~'), ".cache", "dedoc", "resources")) +RESOURCES_PATH = os.environ.get("RESOURCES_PATH", os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources")) _config = dict( # -----------------------------------------RESOURCES PATH SETTINGS---------------------------------------------------- @@ -28,7 +28,7 @@ # max file size in bytes max_content_length=512 * 1024 * 1024, # application port - api_port=int(os.environ.get('DOCREADER_PORT', '1231')), + api_port=int(os.environ.get("DOCREADER_PORT", "1231")), static_files_dirs={}, # log settings logger=logging.getLogger(), @@ -65,7 +65,7 @@ class Configuration(object): __config = None @classmethod - def getInstance(cls: "Configuration") -> "Configuration": + def get_instance(cls: "Configuration") -> "Configuration": """ Actual object creation will happen when we use Configuration.getInstance() """ @@ -74,7 +74,7 @@ def getInstance(cls: "Configuration") -> "Configuration": return cls.__instance - def __initConfig(self, args: Optional[Any] = None) -> None: + def __init_config(self, args: Optional[Any] = None) -> None: if args is not None and args.config_path is not None: spec = importlib.util.spec_from_file_location("config_module", args.config_path) config_module = importlib.util.module_from_spec(spec) @@ -83,11 +83,11 @@ def __initConfig(self, args: Optional[Any] = None) -> None: else: self.__config = _config - def getConfig(self, args: Optional[Any] = None) -> dict: + def get_config(self, args: Optional[Any] = None) -> dict: if self.__config is None or args is not None: - self.__initConfig(args) + self.__init_config(args) return self.__config def get_config(args: 
Optional[Any] = None) -> dict: - return Configuration.getInstance().getConfig(args) + return Configuration.get_instance().get_config(args) diff --git a/dedoc/converters/concrete_converters/abstract_converter.py b/dedoc/converters/concrete_converters/abstract_converter.py index 2be165be..8dc05d8b 100644 --- a/dedoc/converters/concrete_converters/abstract_converter.py +++ b/dedoc/converters/concrete_converters/abstract_converter.py @@ -3,9 +3,9 @@ import subprocess import time from abc import ABC, abstractmethod -from typing import Optional, List +from typing import List, Optional -from dedoc.common.exceptions.conversion_exception import ConversionException +from dedoc.common.exceptions.conversion_error import ConversionError class AbstractConverter(ABC): @@ -57,12 +57,12 @@ def _run_subprocess(self, command: List[str], filename: str, expected_path: str) else: error_message = f"Could not convert file {filename}\n{error_message}" self.logger.error(error_message) - raise ConversionException(msg=error_message) + raise ConversionError(msg=error_message) except subprocess.TimeoutExpired: message = f"Conversion of the {filename} hadn't terminated after {self.timeout} seconds" self.logger.error(message) - raise ConversionException(msg=message) + raise ConversionError(msg=message) def _await_for_conversion(self, filename: str, tmp_dir: str) -> None: t = 0 @@ -71,4 +71,4 @@ def _await_for_conversion(self, filename: str, tmp_dir: str) -> None: t += self.period_checking if t >= self.timeout: - raise ConversionException(msg=f"fail with {tmp_dir}/{filename}", msg_api=f"Unsupported file format {filename}") + raise ConversionError(msg=f"fail with {tmp_dir}/{filename}", msg_api=f"Unsupported file format {filename}")
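The renamed ConversionError leaves the converter control flow untouched: _run_subprocess raises it on a failed conversion and on subprocess timeout, while _await_for_conversion polls for the converted file. A rough, self-contained sketch of that polling logic (the timeout and period_checking defaults here are illustrative; the real values live on AbstractConverter):

import os
import time

from dedoc.common.exceptions.conversion_error import ConversionError


def await_for_conversion(filename: str, tmp_dir: str, timeout: int = 60, period_checking: float = 0.5) -> None:
    # Same loop shape as AbstractConverter._await_for_conversion: wait until the
    # converted file appears, give up with ConversionError once the timeout is spent.
    t = 0.0
    while not os.path.isfile(os.path.join(tmp_dir, filename)):
        time.sleep(period_checking)
        t += period_checking
        if t >= timeout:
            raise ConversionError(msg=f"fail with {tmp_dir}/{filename}", msg_api=f"Unsupported file format {filename}")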
""" - return mime == 'application/octet-stream' and extension in supported_image_types + return mime == "application/octet-stream" and extension in supported_image_types def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py index a32af9b9..0a6abd9c 100644 --- a/dedoc/converters/concrete_converters/docx_converter.py +++ b/dedoc/converters/concrete_converters/docx_converter.py @@ -2,7 +2,7 @@ from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.extensions import converted_mimes, converted_extensions +from dedoc.extensions import converted_extensions, converted_mimes class DocxConverter(AbstractConverter): @@ -23,9 +23,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the docx-like documents into files with .docx extension using the soffice application. """ - path_in = f"{tmp_dir}/{filename}{extension}" + path_in = os.path.join(tmp_dir, f"{filename}{extension}") command = ["soffice", "--headless", "--convert-to", "docx", "--outdir", tmp_dir, path_in] - file_out = filename + ".docx" + file_out = f"{filename}.docx" expected_path = os.path.join(tmp_dir, file_out) self._run_subprocess(command=command, filename=filename, expected_path=expected_path) diff --git a/dedoc/converters/concrete_converters/excel_converter.py b/dedoc/converters/concrete_converters/excel_converter.py index bd9ca793..661fb5c2 100644 --- a/dedoc/converters/concrete_converters/excel_converter.py +++ b/dedoc/converters/concrete_converters/excel_converter.py @@ -23,9 +23,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the xlsx-like documents into files with .xlsx extension using the soffice application. """ - path_in = f"{tmp_dir}/{filename}{extension}" + path_in = os.path.join(tmp_dir, f"{filename}{extension}") command = ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", tmp_dir, path_in] - file_out = filename + '.xlsx' + file_out = f"{filename}.xlsx" expected_path = os.path.join(tmp_dir, file_out) self._run_subprocess(command=command, filename=filename, expected_path=expected_path) diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py index 380fd508..1be6d839 100644 --- a/dedoc/converters/concrete_converters/pdf_converter.py +++ b/dedoc/converters/concrete_converters/pdf_converter.py @@ -1,3 +1,4 @@ +import os from typing import Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter @@ -23,9 +24,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the pdf-like documents into files with .pdf extension using the ddjvu application. 
""" - path_in = f"{tmp_dir}/{filename}{extension}" - expected_path = f"{tmp_dir}/{filename}.pdf" + path_in = os.path.join(tmp_dir, f"{filename}{extension}") + expected_path = os.path.join(tmp_dir, f"{filename}.pdf") command = ["ddjvu", "--format=pdf", path_in, expected_path] self._run_subprocess(command=command, filename=filename, expected_path=expected_path) - return filename + '.pdf' + return filename + ".pdf" diff --git a/dedoc/converters/concrete_converters/png_converter.py b/dedoc/converters/concrete_converters/png_converter.py index f51b0426..3fdcac26 100644 --- a/dedoc/converters/concrete_converters/png_converter.py +++ b/dedoc/converters/concrete_converters/png_converter.py @@ -1,3 +1,4 @@ +import os from typing import Optional import cv2 @@ -25,11 +26,13 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the image-like documents into files with .png extension. """ - if extension in ['.hdr', '.pic', '.sr', '.ras', '.j2k']: - img = cv2.imread(f"{tmp_dir}/{filename}{extension}") - cv2.imwrite(f"{tmp_dir}/{filename}.png", img) + path_in = os.path.join(tmp_dir, f"{filename}{extension}") + path_out = os.path.join(tmp_dir, f"{filename}.png") + if extension in [".hdr", ".pic", ".sr", ".ras", ".j2k"]: + img = cv2.imread(path_in) + cv2.imwrite(path_out, img) else: - img = Image.open(f"{tmp_dir}/{filename}{extension}") - img.save(f"{tmp_dir}/{filename}.png") + img = Image.open(path_in) + img.save(path_out) - return filename + ".png" + return f"{filename}.png" diff --git a/dedoc/converters/concrete_converters/pptx_converter.py b/dedoc/converters/concrete_converters/pptx_converter.py index a9ab630d..312791fe 100644 --- a/dedoc/converters/concrete_converters/pptx_converter.py +++ b/dedoc/converters/concrete_converters/pptx_converter.py @@ -23,9 +23,9 @@ def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: """ Convert the pptx-like documents into files with .pptx extension using the soffice application. """ - path_in = "{tmp_dir}/{filename}{extension}".format(tmp_dir=tmp_dir, extension=extension, filename=filename) + path_in = os.path.join(tmp_dir, f"{filename}{extension}") command = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", tmp_dir, path_in] - file_out = filename + '.pptx' + file_out = f"{filename}.pptx" expected_path = os.path.join(tmp_dir, file_out) self._run_subprocess(command=command, filename=filename, expected_path=expected_path) diff --git a/dedoc/converters/file_converter.py b/dedoc/converters/file_converter.py index 43b78899..7048d0ac 100644 --- a/dedoc/converters/file_converter.py +++ b/dedoc/converters/file_converter.py @@ -1,11 +1,9 @@ -import inspect import os -import warnings from stat import S_IREAD, S_IRGRP, S_IROTH from typing import List, Optional from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter -from dedoc.utils.utils import splitext_, get_file_mime_type +from dedoc.utils.utils import get_file_mime_type, splitext_ class FileConverterComposition(object): @@ -35,13 +33,7 @@ def do_converting(self, tmp_dir: str, filename: str, parameters: Optional[dict] name, extension = splitext_(filename) mime = get_file_mime_type(os.path.join(tmp_dir, filename)) for converter in self.converters: - if "parameters" in inspect.getfullargspec(converter.can_convert).args: - can_convert = converter.can_convert(extension=extension, mime=mime, parameters=parameters) - else: - warnings.warn("!WARNING! 
you converter requires an update\n" + - "Please specify parameters argument in method can_convert in {}\n".format(type(converter).__name__) + - " These parameters would be mandatory in the near future") - can_convert = converter.can_convert(extension=extension, mime=mime) + can_convert = converter.can_convert(extension=extension, mime=mime, parameters=parameters) if can_convert: filename = converter.do_convert(tmp_dir, name, extension) break diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py index 4c102e86..11bffc01 100644 --- a/dedoc/data_structures/annotation.py +++ b/dedoc/data_structures/annotation.py @@ -35,10 +35,10 @@ def __eq__(self, o: object) -> bool: return self.name == o.name and self.value == o.value and self.start == o.start and self.end == o.end def __str__(self) -> str: - return "{name}({start}:{end}, {value})".format(name=self.name.capitalize(), start=self.start, end=self.end, value=self.value) + return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})" def __repr__(self) -> str: - return "{name}(...)".format(name=self.name.capitalize()) + return f"{self.name.capitalize()}(...)" def to_dict(self) -> dict: res = OrderedDict() @@ -52,12 +52,12 @@ def to_dict(self) -> dict: def get_api_dict(api: Api) -> Model: names = ["style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table", "attachment", "spacing", "strike", "subscript", "superscript"] - return api.model('Annotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'name': fields.String(description='annotation name', required=True, example='bold', enum=names), - 'value': fields.String(description='annotation value. For example, it may be font size value for size type ' - 'or type of alignment for alignment type', + return api.model("Annotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "name": fields.String(description="annotation name", required=True, example="bold", enum=names), + "value": fields.String(description="annotation value. 
For example, it may be font size value for size type " + "or type of alignment for alignment type", required=True, example="left") }) diff --git a/dedoc/data_structures/bbox.py b/dedoc/data_structures/bbox.py index 709b54cf..dfdf34b0 100644 --- a/dedoc/data_structures/bbox.py +++ b/dedoc/data_structures/bbox.py @@ -1,5 +1,5 @@ from collections import OrderedDict -from typing import Tuple, Dict +from typing import Dict, Tuple from dedoc.data_structures.serializable import Serializable @@ -46,7 +46,7 @@ def y_bottom_right(self) -> int: return self.y_top_left + self.height def __str__(self) -> str: - return "BBox(x = {} y = {}, w = {}, h = {})".format(self.x_top_left, self.y_top_left, self.width, self.height) + return f"BBox(x = {self.x_top_left} y = {self.y_top_left}, w = {self.width}, h = {self.height})" def __repr__(self) -> str: return self.__str__() @@ -68,10 +68,7 @@ def from_two_points(top_left: Tuple[int, int], bottom_right: Tuple[int, int]) -> """ x_top_left, y_top_left = top_left x_bottom_right, y_bottom_right = bottom_right - return BBox(x_top_left=x_top_left, - y_top_left=y_top_left, - width=x_bottom_right - x_top_left, - height=y_bottom_right - y_top_left) + return BBox(x_top_left=x_top_left, y_top_left=y_top_left, width=x_bottom_right - x_top_left, height=y_bottom_right - y_top_left) def have_intersection_with_box(self, box: "BBox", threshold: float = 0.3) -> bool: """ @@ -81,12 +78,12 @@ def have_intersection_with_box(self, box: "BBox", threshold: float = 0.3) -> boo :param threshold: the lowest value of the intersection over union used to get the boolean result """ # determine the (x, y)-coordinates of the intersection rectangle - xA = max(self.x_top_left, box.x_top_left) - yA = max(self.y_top_left, box.y_top_left) - xB = min(self.x_top_left + self.width, box.x_top_left + box.width) - yB = min(self.y_top_left + self.height, box.y_top_left + box.height) + x_a = max(self.x_top_left, box.x_top_left) + y_a = max(self.y_top_left, box.y_top_left) + x_b = min(self.x_top_left + self.width, box.x_top_left + box.width) + y_b = min(self.y_top_left + self.height, box.y_top_left + box.height) # compute the area of intersection rectangle - inter_a_area = max(0, xB - xA) * max(0, yB - yA) + inter_a_area = max(0, x_b - x_a) * max(0, y_b - y_a) # compute the area of both the prediction and ground-truth # rectangles box_b_area = float(box.width * box.height)
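The snake_case renaming in have_intersection_with_box is behaviour-preserving: the method still clips one box against the other, measures the overlap area and compares the resulting ratio against the threshold. A small worked example with made-up boxes (values picked for easy arithmetic):

from dedoc.data_structures.bbox import BBox

box_a = BBox(x_top_left=0, y_top_left=0, width=10, height=10)
box_b = BBox(x_top_left=5, y_top_left=5, width=10, height=10)
# Overlap rectangle: x in [5, 10), y in [5, 10) -> 5 * 5 = 25 pixels against
# box areas of 100 each, so the ratio stays below the default threshold of 0.3
# and the check should come out False for these two boxes.
print(box_a.have_intersection_with_box(box_b))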
""" @@ -28,8 +28,8 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('CellProperty', { - 'colspan': fields.Integer(description='attribute of union column count'), - 'rowspan': fields.Integer(description='attribute of union row count'), - 'invisible': fields.Boolean(description='flag for cell display (for example: if invisible==true then style=\"display: none\")'), + return api.model("CellProperty", { + "colspan": fields.Integer(description="attribute of union column count"), + "rowspan": fields.Integer(description="attribute of union row count"), + "invisible": fields.Boolean(description='flag for cell display (for example: if invisible==true then style="display: none")'), }) diff --git a/dedoc/data_structures/concrete_annotations/alignment_annotation.py b/dedoc/data_structures/concrete_annotations/alignment_annotation.py index 6f598631..615f8786 100644 --- a/dedoc/data_structures/concrete_annotations/alignment_annotation.py +++ b/dedoc/data_structures/concrete_annotations/alignment_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -22,11 +22,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('AlignmentAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='alignment of the text', - required=True, - example="left", - enum=AlignmentAnnotation.valid_values) + return api.model("AlignmentAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="alignment of the text", required=True, example="left", enum=AlignmentAnnotation.valid_values) }) diff --git a/dedoc/data_structures/concrete_annotations/attach_annotation.py b/dedoc/data_structures/concrete_annotations/attach_annotation.py index c031d949..6b276cbc 100644 --- a/dedoc/data_structures/concrete_annotations/attach_annotation.py +++ b/dedoc/data_structures/concrete_annotations/attach_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import Model, Api, fields +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -21,6 +21,6 @@ def __init__(self, attach_uid: str, start: int, end: int) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('AttachAnnotation', { - 'value': fields.String(description='ref to attachment', required=True, example="attach fafffa145agh") + return api.model("AttachAnnotation", { + "value": fields.String(description="ref to attachment", required=True, example="attach fafffa145agh") }) diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py index a74706cb..fdeb145c 100644 --- a/dedoc/data_structures/concrete_annotations/bbox_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bbox_annotation.py @@ -1,8 +1,8 @@ import json -from dedoc.data_structures.annotation import Annotation -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields +from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.bbox import 
BBox @@ -26,10 +26,10 @@ def __init__(self, start: int, end: int, value: BBox) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('BBoxAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='bounding box of text chunk', + return api.model("BBoxAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="bounding box of text chunk", required=True, example='{"x_top_left": 0, "y_top_left": 0, "width": 70, "height": 20}') }) diff --git a/dedoc/data_structures/concrete_annotations/bold_annotation.py b/dedoc/data_structures/concrete_annotations/bold_annotation.py index 8531a607..871ab166 100644 --- a/dedoc/data_structures/concrete_annotations/bold_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bold_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,11 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('BoldAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is bold or not', - required=True, - example="True", - enum=BoldAnnotation.valid_values) + return api.model("BoldAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is bold or not", required=True, example="True", enum=BoldAnnotation.valid_values) }) diff --git a/dedoc/data_structures/concrete_annotations/color_annotation.py b/dedoc/data_structures/concrete_annotations/color_annotation.py index 772fc5fb..4b6983d6 100644 --- a/dedoc/data_structures/concrete_annotations/color_annotation.py +++ b/dedoc/data_structures/concrete_annotations/color_annotation.py @@ -33,4 +33,4 @@ def __init__(self, start: int, end: int, red: float, green: float, blue: float) super().__init__(start=start, end=end, name=ColorAnnotation.name, value=json.dumps(value)) def __str__(self) -> str: - return "ColorAnnotation(red={}, green={}, blue={})".format(self.red, self.green, self.blue) + return f"ColorAnnotation(red={self.red}, green={self.green}, blue={self.blue})" diff --git a/dedoc/data_structures/concrete_annotations/confidence_annotation.py b/dedoc/data_structures/concrete_annotations/confidence_annotation.py index af18120f..d7977935 100644 --- a/dedoc/data_structures/concrete_annotations/confidence_annotation.py +++ b/dedoc/data_structures/concrete_annotations/confidence_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -25,8 +25,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('BoldAnnotation', { - 'start': fields.Integer(description='annotation start index', 
required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='confidence value', required=True, example="95") + return api.model("ConfidenceAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="confidence value", required=True, example="95") }) diff --git a/dedoc/data_structures/concrete_annotations/indentation_annotation.py b/dedoc/data_structures/concrete_annotations/indentation_annotation.py index 1e431af4..4ecbfd16 100644 --- a/dedoc/data_structures/concrete_annotations/indentation_annotation.py +++ b/dedoc/data_structures/concrete_annotations/indentation_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('IndentationAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='text indentation in twentieths of a point (1/1440 of an inch)', - required=True, - example="720") + return api.model("IndentationAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="text indentation in twentieths of a point (1/1440 of an inch)", required=True, example="720") }) diff --git a/dedoc/data_structures/concrete_annotations/italic_annotation.py b/dedoc/data_structures/concrete_annotations/italic_annotation.py index 62ed3a28..0cfc83a1 100644 --- a/dedoc/data_structures/concrete_annotations/italic_annotation.py +++ b/dedoc/data_structures/concrete_annotations/italic_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,11 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('ItalicAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is italic or not', - required=True, - example="True", - enum=ItalicAnnotation.valid_values) + return api.model("ItalicAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is italic or not", required=True, example="True", enum=ItalicAnnotation.valid_values) }) diff --git a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py index fdcecb76..9bd9228e 100644 --- a/dedoc/data_structures/concrete_annotations/linked_text_annotation.py +++ b/dedoc/data_structures/concrete_annotations/linked_text_annotation.py @@ -1,4 +1,4 @@ -from 
flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -20,9 +20,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('LinkedTextAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='text, linked to given, for example text of the footnote', - required=True,) + return api.model("LinkedTextAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="text linked to the given one, for example text of the footnote", required=True) }) diff --git a/dedoc/data_structures/concrete_annotations/size_annotation.py b/dedoc/data_structures/concrete_annotations/size_annotation.py index 6c42e35a..c82c1df0 100644 --- a/dedoc/data_structures/concrete_annotations/size_annotation.py +++ b/dedoc/data_structures/concrete_annotations/size_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SizeAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='the size of the text in points (1/72 of an inch)', - required=True, - example="18.5") + return api.model("SizeAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="the size of the text in points (1/72 of an inch)", required=True, example="18.5") }) diff --git a/dedoc/data_structures/concrete_annotations/spacing_annotation.py b/dedoc/data_structures/concrete_annotations/spacing_annotation.py index 81dace76..ba0c4e1b 100644 --- a/dedoc/data_structures/concrete_annotations/spacing_annotation.py +++ b/dedoc/data_structures/concrete_annotations/spacing_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -19,16 +19,16 @@ def __init__(self, start: int, end: int, value: str) -> None: try: int(value) except ValueError: - raise ValueError("the value of spacing annotation should be a number, get {}".format(value)) + raise ValueError(f"the value of spacing annotation should be a number, got {value}") super().__init__(start=start, end=end, name=SpacingAnnotation.name, value=value) @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SpacingAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='spacing between the current line and the previous one in ' - 'twentieths of a point or one hundredths of a line', + return api.model("SpacingAnnotation", { + 
"start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="spacing between the current line and the previous one in " + "twentieths of a point or one hundredths of a line", required=True, example="240") }) diff --git a/dedoc/data_structures/concrete_annotations/strike_annotation.py b/dedoc/data_structures/concrete_annotations/strike_annotation.py index 158353ad..25cc9806 100644 --- a/dedoc/data_structures/concrete_annotations/strike_annotation.py +++ b/dedoc/data_structures/concrete_annotations/strike_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('StrikeAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is strikethrough or not', + return api.model("StrikeAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is strikethrough or not", required=True, example="True", enum=StrikeAnnotation.valid_values) diff --git a/dedoc/data_structures/concrete_annotations/style_annotation.py b/dedoc/data_structures/concrete_annotations/style_annotation.py index 2f7fd1c8..234750a3 100644 --- a/dedoc/data_structures/concrete_annotations/style_annotation.py +++ b/dedoc/data_structures/concrete_annotations/style_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -20,10 +20,8 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('StyleAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='style name', - required=True, - example="heading 1") + return api.model("StyleAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="style name", required=True, example="heading 1") }) diff --git a/dedoc/data_structures/concrete_annotations/subscript_annotation.py b/dedoc/data_structures/concrete_annotations/subscript_annotation.py index 9ca3f2ad..db3edbfe 100644 --- a/dedoc/data_structures/concrete_annotations/subscript_annotation.py +++ b/dedoc/data_structures/concrete_annotations/subscript_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SuperscriptAnnotation', { - 'start': 
fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is subscript ($a_1$ in tex) or not', + return api.model("SubscriptAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is subscript ($a_1$ in tex) or not", required=True, example="True", enum=SubscriptAnnotation.valid_values) diff --git a/dedoc/data_structures/concrete_annotations/superscript_annotation.py b/dedoc/data_structures/concrete_annotations/superscript_annotation.py index d2e47dee..98611918 100644 --- a/dedoc/data_structures/concrete_annotations/superscript_annotation.py +++ b/dedoc/data_structures/concrete_annotations/superscript_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('SuperscriptAnnotation', { - 'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is superscript ($a^1$ in tex) or not', + return api.model("SuperscriptAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is superscript ($a^1$ in tex) or not", required=True, example="True", enum=SuperscriptAnnotation.valid_values) diff --git a/dedoc/data_structures/concrete_annotations/table_annotation.py b/dedoc/data_structures/concrete_annotations/table_annotation.py index 64217713..8842a84d 100644 --- a/dedoc/data_structures/concrete_annotations/table_annotation.py +++ b/dedoc/data_structures/concrete_annotations/table_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import Model, Api, fields +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -20,6 +20,6 @@ def __init__(self, name: str, start: int, end: int) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('TableAnnotation', { - 'value': fields.String(description='ref to table', required=True, example="table fafffa145agh") + return api.model("TableAnnotation", { + "value": fields.String(description="ref to table", required=True, example="table fafffa145agh") }) diff --git a/dedoc/data_structures/concrete_annotations/underlined_annotation.py b/dedoc/data_structures/concrete_annotations/underlined_annotation.py index b5249a56..e77e397c 100644 --- a/dedoc/data_structures/concrete_annotations/underlined_annotation.py +++ b/dedoc/data_structures/concrete_annotations/underlined_annotation.py @@ -1,4 +1,4 @@ -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation @@ -24,10 +24,10 @@ def __init__(self, start: int, end: int, value: str) -> None: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('UnderlinedAnnotation', { - 
'start': fields.Integer(description='annotation start index', required=True, example=0), - 'end': fields.Integer(description='annotation end index', required=True, example=4), - 'value': fields.String(description='indicator if the text is underlined or not', + return api.model("UnderlinedAnnotation", { + "start": fields.Integer(description="annotation start index", required=True, example=0), + "end": fields.Integer(description="annotation end index", required=True, example=4), + "value": fields.String(description="indicator if the text is underlined or not", required=True, example="True", enum=UnderlinedAnnotation.valid_values) diff --git a/dedoc/data_structures/document_content.py b/dedoc/data_structures/document_content.py index a810d82a..3cf7c1bb 100644 --- a/dedoc/data_structures/document_content.py +++ b/dedoc/data_structures/document_content.py @@ -1,7 +1,7 @@ from collections import OrderedDict from typing import List -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.serializable import Serializable from dedoc.data_structures.table import Table @@ -12,7 +12,7 @@ class DocumentContent(Serializable): """ This class holds the document content - structured text and tables. """ - def __init__(self, tables: List[Table], structure: 'TreeNode', warnings: List[str] = None) -> None: + def __init__(self, tables: List[Table], structure: TreeNode, warnings: List[str] = None) -> None: """ :param tables: list of document tables :param structure: tree structure in which content of the document is organized @@ -30,7 +30,7 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('DocumentContent', { - 'structure': fields.Nested(TreeNode.get_api_dict(api), readonly=True, description='document content structure'), - 'tables': fields.List(fields.Nested(Table.get_api_dict(api), description="tables structure")) + return api.model("DocumentContent", { + "structure": fields.Nested(TreeNode.get_api_dict(api), readonly=True, description="document content structure"), + "tables": fields.List(fields.Nested(Table.get_api_dict(api), description="tables structure")) }) diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index a05777d3..67be8956 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,7 +1,7 @@ import uuid from collections import OrderedDict -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.api.models.custom_fields import wild_any_fields from dedoc.data_structures.serializable import Serializable @@ -43,7 +43,7 @@ def __init__(self, self.other_fields = {} if other_fields is not None and len(other_fields) > 0: self.extend_other_fields(other_fields) - self.uid = "doc_uid_auto_{}".format(uuid.uuid1()) if uid is None else uid + self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid def set_uid(self, uid: str) -> None: self.uid = uid # noqa @@ -79,14 +79,14 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('DocumentMetadata', { - "uid": fields.String(description='unique document identifier', example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"), - 'file_name': fields.String(description='file name', example="example.odt"), - 'temporary_file_name': fields.String(description='file name', example="123.odt"), - 'size': fields.Integer(description='file size in bytes', example="20060"), - 
'modified_time': fields.Integer(description='modification time of the document in the format UnixTime', example="1590579805"), - 'created_time': fields.Integer(description='creation time of the document in the format UnixTime', example="1590579805"), - 'access_time': fields.Integer(description='file access time in format UnixTime', example="1590579805"), - 'file_type': fields.String(description='mime-type file', example="application/vnd.oasis.opendocument.text"), - '[a-z]*': wild_any_fields + return api.model("DocumentMetadata", { + "uid": fields.String(description="unique document identifier", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"), + "file_name": fields.String(description="file name", example="example.odt"), + "temporary_file_name": fields.String(description="file name", example="123.odt"), + "size": fields.Integer(description="file size in bytes", example="20060"), + "modified_time": fields.Integer(description="modification time of the document in the format UnixTime", example="1590579805"), + "created_time": fields.Integer(description="creation time of the document in the format UnixTime", example="1590579805"), + "access_time": fields.Integer(description="file access time in format UnixTime", example="1590579805"), + "file_type": fields.String(description="file mime-type", example="application/vnd.oasis.opendocument.text"), + "[a-z]*": wild_any_fields }) diff --git a/dedoc/data_structures/hierarchy_level.py b/dedoc/data_structures/hierarchy_level.py index ab0d2cd7..06b74b50 100644 --- a/dedoc/data_structures/hierarchy_level.py +++ b/dedoc/data_structures/hierarchy_level.py @@ -18,7 +18,7 @@ class HierarchyLevel: toc = "toc" header = "header" toc_item = "toc_item" - list = "list" + list = "list" # noqa list_item = "list_item" bullet_list_item = "bullet_list_item" raw_text = "raw_text" diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py index 05381ba2..ccca5c0a 100644 --- a/dedoc/data_structures/line_metadata.py +++ b/dedoc/data_structures/line_metadata.py @@ -1,7 +1,7 @@ from collections import OrderedDict from typing import Optional -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.api.models.custom_fields import wild_any_fields, wild_forbid_fields from dedoc.data_structures.hierarchy_level import HierarchyLevel @@ -62,12 +62,12 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('LineMetadata', { - 'paragraph_type': fields.String(description="paragraph type (header, list_item, list) and etc.", required=True, example="header"), - 'page_id': fields.Integer(description="page number of begin paragraph", required=False, example=0), - 'line_id': fields.Integer(description="line number of begin paragraph", required=True, example=13), - '_*': wild_forbid_fields, # don't get private fields - 'tag_hierarchy_level': wild_forbid_fields, - 'hierarchy_level': wild_forbid_fields, - '[a-z]*': wild_any_fields + return api.model("LineMetadata", { + "paragraph_type": fields.String(description="paragraph type (header, list_item, list, etc.)", required=True, example="header"), + "page_id": fields.Integer(description="page number of begin paragraph", required=False, example=0), + "line_id": fields.Integer(description="line number of begin paragraph", required=True, example=13), + "_*": wild_forbid_fields, # don't get private fields + "tag_hierarchy_level": wild_forbid_fields, + "hierarchy_level": wild_forbid_fields, + "[a-z]*": wild_any_fields 
}) diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index c73626dc..74321548 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -1,5 +1,5 @@ import re -from typing import List, Union, Sized +from typing import List, Sized, Union from uuid import uuid1 from dedoc.data_structures.annotation import Annotation diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 81cd1bf2..9483ecb1 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -1,7 +1,7 @@ from collections import OrderedDict from typing import List, Optional -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields import dedoc from dedoc.data_structures.document_content import DocumentContent @@ -43,20 +43,19 @@ def to_dict(self, depth: int = 0) -> dict: res["warnings"] = self.warnings res["content"] = self.content.to_dict() if self.content is not None else [] res["metadata"] = self.metadata.to_dict() - res["attachments"] = [attachment.to_dict(depth=depth + 1) for attachment in self.attachments] \ - if self.attachments is not None and depth < 10 else [] + res["attachments"] = [attachment.to_dict(depth=depth + 1) for attachment in self.attachments] if self.attachments is not None and depth < 10 else [] return res @staticmethod - def get_api_dict(api: Api, depth: int = 0, name: str = 'ParsedDocument') -> Model: + def get_api_dict(api: Api, depth: int = 0, name: str = "ParsedDocument") -> Model: return api.model(name, { - 'content': fields.Nested(DocumentContent.get_api_dict(api), description='Document content structure'), - 'metadata': fields.Nested(DocumentMetadata.get_api_dict(api), allow_null=False, skip_none=True, description='Document meta information'), - 'version': fields.String(description='the version of the program that parsed this document', example="0.9.1"), - 'warnings': fields.List(fields.String(description='list of warnings and possible errors', example="DOCX: seems that document corrupted")), - 'attachments': fields.List(fields.Nested(api.model('others_ParsedDocument', {})), description='structure of attachments', required=False) + "content": fields.Nested(DocumentContent.get_api_dict(api), description="Document content structure"), + "metadata": fields.Nested(DocumentMetadata.get_api_dict(api), allow_null=False, skip_none=True, description="Document meta information"), + "version": fields.String(description="the version of the program that parsed this document", example="0.9.1"), + "warnings": fields.List(fields.String(description="list of warnings and possible errors", example="DOCX: seems that document corrupted")), + "attachments": fields.List(fields.Nested(api.model("others_ParsedDocument", {})), description="structure of attachments", required=False) if depth == 10 # TODO delete this - else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name='refParsedDocument' + str(depth)), - description='Attachment structure', + else fields.List(fields.Nested(ParsedDocument.get_api_dict(api, depth=depth + 1, name="refParsedDocument" + str(depth)), + description="Attachment structure", required=False))}) diff --git a/dedoc/data_structures/table.py b/dedoc/data_structures/table.py index 97922ac7..c9d45b68 100644 --- a/dedoc/data_structures/table.py +++ b/dedoc/data_structures/table.py @@ -1,6 +1,7 @@ from collections import OrderedDict -from typing import List, Optional, Any -from 
flask_restx import fields, Api, Model +from typing import Any, List, Optional + +from flask_restx import Api, Model, fields from dedoc.data_structures.cell_property import CellProperty from dedoc.data_structures.serializable import Serializable @@ -31,7 +32,7 @@ def to_dict(self) -> dict: @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('Table', { - 'cells': fields.List(fields.List(fields.String(description="Cell contains text")), description="matrix of cells"), - 'metadata': fields.Nested(TableMetadata.get_api_dict(api), readonly=True, description='Table meta information') + return api.model("Table", { + "cells": fields.List(fields.List(fields.String(description="Cell contains text")), description="matrix of cells"), + "metadata": fields.Nested(TableMetadata.get_api_dict(api), readonly=True, description="Table meta information") }) diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index 86d3f63a..be5c3fe0 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -1,9 +1,10 @@ from collections import OrderedDict -from typing import Optional, List -from flask_restx import fields, Api, Model +from typing import List, Optional + +from flask_restx import Api, Model, fields -from dedoc.data_structures.serializable import Serializable from dedoc.data_structures.cell_property import CellProperty +from dedoc.data_structures.serializable import Serializable class TableMetadata(Serializable): @@ -28,17 +29,16 @@ def to_dict(self) -> dict: res["uid"] = self.uid res["page_id"] = self.page_id res["is_inserted"] = self.is_inserted - res["cell_properties"] = [[cell_prop.to_dict() for cell_prop in row_prop] - for row_prop in self.cell_properties] if self.cell_properties else None + res["cell_properties"] = [[cell_prop.to_dict() for cell_prop in row_prop] for row_prop in self.cell_properties] if self.cell_properties else None return res @staticmethod def get_api_dict(api: Api) -> Model: - return api.model('TableMetadata', { - 'page_id': fields.Integer(readonly=False, description='table start page number'), - 'uid': fields.String(description="table unique id"), - 'is_inserted': fields.Boolean(description="was the table inserted into document body"), - 'cell_properties': fields.List(fields.List(fields.Nested(CellProperty.get_api_dict(api), + return api.model("TableMetadata", { + "page_id": fields.Integer(readonly=False, description="table start page number"), + "uid": fields.String(description="table unique id"), + "is_inserted": fields.Boolean(description="was the table inserted into document body"), + "cell_properties": fields.List(fields.List(fields.Nested(CellProperty.get_api_dict(api), description="cell properties, colspan, rowspan, etc", allow_null=True, skip_none=True))) diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 3884d8fa..454e5059 100644 --- a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -1,14 +1,14 @@ from collections import OrderedDict from typing import List, Optional -from flask_restx import fields, Api, Model +from flask_restx import Api, Model, fields from dedoc.data_structures.annotation import Annotation -from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.serializable import 
Serializable from dedoc.utils.annotation_merger import AnnotationMerger -from dedoc.data_structures.hierarchy_level import HierarchyLevel class TreeNode(Serializable): @@ -48,24 +48,21 @@ def to_dict(self) -> dict: return res @staticmethod - def get_api_dict(api: Api, depth: int = 0, name: str = 'TreeNode') -> Model: + def get_api_dict(api: Api, depth: int = 0, name: str = "TreeNode") -> Model: return api.model(name, { - 'node_id': fields.String(description="Document element identifier. It is unique within one tree (i.e. " + "node_id": fields.String(description="Document element identifier. It is unique within one tree (i.e. " "there will be no other such node_id in this tree, but in attachment " "it may occur) The identifier has the form 0.2.1 where each number " "means a serial number at the corresponding level of the hierarchy.", required=True, - example="0.2.1" - ), - 'text': fields.String(description="text of node", required=True, example="Закон"), - 'annotations': fields.List(fields.Nested(Annotation.get_api_dict(api), - description="Text annotations (font, size, bold, italic and etc)")), - 'metadata': fields.Nested(LineMetadata.get_api_dict(api), skip_none=True, allow_null=False, description="Line meta information"), - 'subparagraphs': fields.List(fields.Nested(api.model('others_TreeNode', {})), - description="Node childes (with type 'TreeNode') of structure tree") + example="0.2.1"), + "text": fields.String(description="text of node", required=True, example="Закон"), + "annotations": fields.List(fields.Nested(Annotation.get_api_dict(api), description="Text annotations (font, size, bold, italic, etc.)")), + "metadata": fields.Nested(LineMetadata.get_api_dict(api), skip_none=True, allow_null=False, description="Line meta information"), + "subparagraphs": fields.List(fields.Nested(api.model("others_TreeNode", {})), description='Node children (with type "TreeNode") of structure tree') if depth == 30 # TODO delete this - else fields.List(fields.Nested(TreeNode.get_api_dict(api, depth=depth + 1, name='refTreeNode' + str(depth))), - description="Node childes (with type 'TreeNode') of structure tree") + else fields.List(fields.Nested(TreeNode.get_api_dict(api, depth=depth + 1, name="refTreeNode" + str(depth))), + description='Node children (with type "TreeNode") of structure tree') }) @staticmethod @@ -101,7 +98,7 @@ def add_child(self, line: LineWithMeta) -> "TreeNode": :return: return created node (child of the self) """ new_node = TreeNode( - node_id=self.node_id + ".{}".format(len(self.subparagraphs)), + node_id=f"{self.node_id}.{len(self.subparagraphs)}", text=line.line, annotations=line.annotations, metadata=line.metadata,
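The add_child hunk swaps .format for an f-string but keeps the identifier scheme itself: a child's node_id is the parent's node_id plus the index the child receives in subparagraphs. Illustratively, with plain values instead of a real TreeNode:

parent_node_id = "0.2"
subparagraphs = ["first child", "second child"]  # two children already attached
new_node_id = f"{parent_node_id}.{len(subparagraphs)}"
print(new_node_id)  # "0.2.2" - the identifier the next child would get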
save_line_with_meta from dedoc.utils.utils import get_unique_name @@ -67,7 +67,7 @@ def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> try: return self.__parse_no_error_handling(file_path=file_path, parameters=parameters) - except DedocException as e: + except DedocError as e: file_dir, file_name = os.path.split(file_path) e.filename = file_name e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, filename=file_name, name_actual=file_name) diff --git a/dedoc/download_models.py b/dedoc/download_models.py index dd511463..9af9abd9 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -6,7 +6,6 @@ from dedoc.config import get_config - """ These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc. Keys are the names of repositories with models. diff --git a/dedoc/extensions.py b/dedoc/extensions.py index a21fd585..3e8d326a 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -3,38 +3,38 @@ from dedoc.utils.utils import get_extensions_by_mimes -Extensions = namedtuple('Parts', 'excel_like_format ' - 'pptx_like_format ' - 'csv_like_format ' - 'docx_like_format ' - 'archive_like_format ' - 'image_like_format ' - 'pdf_like_format ' - 'txt_like_format') +Extensions = namedtuple("Parts", "excel_like_format " + "pptx_like_format " + "csv_like_format " + "docx_like_format " + "archive_like_format " + "image_like_format " + "pdf_like_format " + "txt_like_format") converted_extensions = Extensions( - excel_like_format=['.ods', 'xls'], - docx_like_format=['.odt', '.doc'], - pptx_like_format=['.odp', '.ppt'], + excel_like_format=[".ods", ".xls"], + docx_like_format=[".odt", ".doc"], + pptx_like_format=[".odp", ".ppt"], archive_like_format=[], - image_like_format=['.pcx', '.webp', '.sgi', '.hdr', '.sr', '.pic', '.dib', '.jfif', '.j2k'], + image_like_format=[".pcx", ".webp", ".sgi", ".hdr", ".sr", ".pic", ".dib", ".jfif", ".j2k"], pdf_like_format=[], csv_like_format=[], - txt_like_format=['.xml'] + txt_like_format=[".xml"] ) # .sgi, .hdr, .sr, .ras - not registered in mime converted_mimes = Extensions( excel_like_format=["application/vnd.oasis.opendocument.spreadsheet", "application/vnd.ms-excel"], - docx_like_format=['application/msword', "application/vnd.oasis.opendocument.text"], - pptx_like_format=['application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'application/vnd.ms-powerpoint', 'application/vnd.oasis.opendocument.presentation'], + docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text"], + pptx_like_format=["application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-powerpoint", "application/vnd.oasis.opendocument.presentation"], archive_like_format=[], - image_like_format=['image/gif', - 'image/x-portable-pixmap', 'image/x-portable-anymap', 'image/x-portable-graymap', - 'image/x-portable-bitmap', 'image/x-pcx', 'image/x-pict', - 'application/postscript', 'image/x-cmu-raster'], - pdf_like_format=['image/vnd.djvu'], + image_like_format=["image/gif", + "image/x-portable-pixmap", "image/x-portable-anymap", "image/x-portable-graymap", + "image/x-portable-bitmap", "image/x-pcx", "image/x-pict", + "application/postscript", "image/x-cmu-raster"], + pdf_like_format=["image/vnd.djvu"], csv_like_format=[], txt_like_format=["application/xml", "text/xml"] ) @@ -51,14 +51,14 @@ ) recognized_mimes = Extensions( -
excel_like_format=['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.ms-excel'], - docx_like_format=['application/vnd.openxmlformats-officedocument.wordprocessingml.document'], - pptx_like_format=['application/vnd.openxmlformats-officedocument.presentationml.presentation'], - archive_like_format=['application/zip', 'application/x-tar', 'application/x-rar-compressed', 'application/rar', 'application/x-7z-compressed'], - image_like_format=['image/jpeg', 'image/png', 'image/tiff', 'image/x-ms-bmp', 'image/bmp'], - pdf_like_format=['application/pdf'], + excel_like_format=["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"], + docx_like_format=["application/vnd.openxmlformats-officedocument.wordprocessingml.document"], + pptx_like_format=["application/vnd.openxmlformats-officedocument.presentationml.presentation"], + archive_like_format=["application/zip", "application/x-tar", "application/x-rar-compressed", "application/rar", "application/x-7z-compressed"], + image_like_format=["image/jpeg", "image/png", "image/tiff", "image/x-ms-bmp", "image/bmp"], + pdf_like_format=["application/pdf"], csv_like_format=[], - txt_like_format=['text/plain', 'text/html'] + txt_like_format=["text/plain", "text/html"] ) diff --git a/dedoc/main.py b/dedoc/main.py index b784fad0..e80ef2e4 100644 --- a/dedoc/main.py +++ b/dedoc/main.py @@ -1,7 +1,7 @@ import argparse +from dedoc.api.dedoc_api import get_api, run_api # noqa from dedoc.config import Configuration, get_config -from dedoc.api.dedoc_api import run_api, get_api # noqa def main() -> None: @@ -12,11 +12,11 @@ def main() -> None: parser_config = argparse.ArgumentParser() parser_config.add_argument("-c", "--config_path", help="path to configuration file") parser_config.add_argument("-m", "--module", help="Only for tests") - parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs='*', help="Only for tests") - parser_config.add_argument('-v', "--unitest_verbose_mode", nargs='?', help="to enable verbose mode of unittest. Only for tests") + parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs="*", help="Only for tests") + parser_config.add_argument("-v", "--unitest_verbose_mode", nargs="?", help="to enable verbose mode of unittest. 
Only for tests") args_config = parser_config.parse_args() - Configuration.getInstance().getConfig(args_config) + Configuration.get_instance().get_config(args_config) config = get_config() if config.get("labeling_mode", False): diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py index f2cd01c7..b7993f53 100644 --- a/dedoc/manager_config.py +++ b/dedoc/manager_config.py @@ -1,3 +1,5 @@ +from typing import Optional + from dedoc.attachments_handler.attachments_handler import AttachmentsHandler from dedoc.converters.concrete_converters.binary_converter import BinaryConverter from dedoc.converters.concrete_converters.docx_converter import DocxConverter @@ -114,7 +116,7 @@ class ConfigurationManager(object): __config = None @classmethod - def getInstance(cls: "ConfigurationManager") -> "ConfigurationManager": + def get_instance(cls: "ConfigurationManager") -> "ConfigurationManager": """ Actual object creation will happen when we use ConfigurationManager.getInstance() """ @@ -123,17 +125,17 @@ def getInstance(cls: "ConfigurationManager") -> "ConfigurationManager": return cls.__instance - def initConfig(self, config: dict, new_config: dict = None) -> None: + def init_config(self, config: dict, new_config: Optional[dict] = None) -> None: if new_config is None: self.__config = _get_manager_config(config) else: self.__config = new_config - def getConfig(self, config: dict) -> dict: + def get_config(self, config: dict) -> dict: if self.__config is None: - self.initConfig(config) + self.init_config(config) return self.__config def get_manager_config(config: dict) -> dict: - return ConfigurationManager().getInstance().getConfig(config) + return ConfigurationManager().get_instance().get_config(config) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index b1e2399e..5d37ad61 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -4,7 +4,7 @@ from typing import Optional, Union import piexif -from PIL import Image, ExifTags +from PIL import ExifTags, Image from dateutil import parser from dedoc.data_structures.unstructured_document import UnstructuredDocument diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index 14839416..2708e5e6 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -2,7 +2,7 @@ import pickle from typing import Optional -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor @@ -46,20 +46,20 @@ def add_metadata(self, try: file_path = os.path.join(directory, filename) - with open(file_path, 'rb') as infile: + with open(file_path, "rb") as infile: note_dict = pickle.load(infile) - fields = {"author": note_dict['author']} + fields = {"author": note_dict["author"]} other_fields = {**other_fields, **fields} if other_fields is not None else fields meta_info = 
dict(file_name=original_filename, file_type="note", - size=note_dict['size'], - access_time=note_dict['modified_time'], - created_time=note_dict['created_time'], - modified_time=note_dict['modified_time'], + size=note_dict["size"], + access_time=note_dict["modified_time"], + created_time=note_dict["created_time"], + modified_time=note_dict["modified_time"], other_fields=other_fields) document.metadata = meta_info return document except Exception: - raise BadFileFormatException(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") + raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index 74660e9e..96682fc0 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -90,7 +90,7 @@ def _get_pdf_info(self, path: str) -> dict: except PdfReadError: return {"broken_pdf": True} except Exception as e: - self.logger.warning("exception while extract pdf metadata: {} {}".format(path, e)) + self.logger.warning(f"exception while extracting pdf metadata: {path} {e}") if self.config.get("debug_mode", False): raise e return {"broken_pdf": True} diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py index e493aff2..80b73adc 100644 --- a/dedoc/readers/archive_reader/archive_reader.py +++ b/dedoc/readers/archive_reader/archive_reader.py @@ -4,12 +4,12 @@ import uuid import zipfile import zlib -from typing import List, Optional, IO, Iterator +from typing import IO, Iterator, List, Optional import py7zlib import rarfile -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes @@ -47,30 +47,30 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio def __get_attachments(self, path: str) -> List[AttachedFile]: tmp_dir = os.path.dirname(path) mime = get_file_mime_type(path) - if zipfile.is_zipfile(path) and mime == 'application/zip': + if zipfile.is_zipfile(path) and mime == "application/zip": return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir)) if tarfile.is_tarfile(path): return list(self.__read_tar_archive(path=path, tmp_dir=tmp_dir)) if rarfile.is_rarfile(path): return list(self.__read_rar_archive(path=path, tmp_dir=tmp_dir)) - if mime == 'application/x-7z-compressed': + if mime == "application/x-7z-compressed": return list(self.__read_7z_archive(path=path, tmp_dir=tmp_dir)) # if no one can handle this archive raise exception - raise BadFileFormatException("bad archive {}".format(path)) + raise BadFileFormatError(f"bad archive {path}") def __read_zip_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: try: - with zipfile.ZipFile(path, 'r') as arch_file: + with zipfile.ZipFile(path, "r") as arch_file: names = [member.filename for member in arch_file.infolist() if member.file_size > 0] for name in names: with arch_file.open(name) as file: yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file) except
(zipfile.BadZipFile, zlib.error) as e: - self.logger.warning("Can't read file {} ({})".format(path, e)) - raise BadFileFormatException("Can't read file {} ({})".format(path, e)) + self.logger.warning(f"Can't read file {path} ({e})") + raise BadFileFormatError(f"Can't read file {path} ({e})") def __read_tar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: - with tarfile.open(path, 'r') as arch_file: + with tarfile.open(path, "r") as arch_file: names = [member.name for member in arch_file.getmembers() if member.isfile()] for name in names: file = arch_file.extractfile(name) @@ -78,7 +78,7 @@ def __read_tar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: file.close() def __read_rar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]: - with rarfile.RarFile(path, 'r') as arch_file: + with rarfile.RarFile(path, "r") as arch_file: names = [item.filename for item in arch_file.infolist() if item.compress_size > 0] for name in names: with arch_file.open(name) as file: @@ -102,6 +102,6 @@ def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes]) -> original_name=file_name, tmp_file_path=os.path.join(tmp_dir, tmp_path), need_content_analysis=True, - uid="attach_{}".format(uuid.uuid1()) + uid=f"attach_{uuid.uuid1()}" ) return attachment diff --git a/dedoc/readers/csv_reader/csv_reader.py b/dedoc/readers/csv_reader/csv_reader.py index 872b9e23..9b639506 100644 --- a/dedoc/readers/csv_reader/csv_reader.py +++ b/dedoc/readers/csv_reader/csv_reader.py @@ -1,5 +1,5 @@ import csv -from typing import Optional, Tuple, List +from typing import List, Optional, Tuple from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata @@ -38,7 +38,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio data = list(csv_reader) table_metadata = TableMetadata(page_id=0) tables = [Table(cells=data, metadata=table_metadata)] - warnings = ["delimiter is '{}'".format(delimiter)] + warnings = [f"delimiter is '{delimiter}'"] warnings.extend(encoding_warning) return UnstructuredDocument(lines=[], tables=tables, attachments=[], warnings=warnings) diff --git a/dedoc/readers/docx_reader/data_structures/base_props.py b/dedoc/readers/docx_reader/data_structures/base_props.py index 841e183a..c439c3d0 100644 --- a/dedoc/readers/docx_reader/data_structures/base_props.py +++ b/dedoc/readers/docx_reader/data_structures/base_props.py @@ -8,7 +8,7 @@ def __init__(self, properties: Optional["BaseProperties"] = None) -> None: # no Contains style properties for paragraphs and runs. 
:param properties: Paragraph or Run for copying its properties """ - self.jc = properties.jc if properties else 'left' + self.jc = properties.jc if properties else "left" self.indentation = properties.indentation if properties and properties.indentation else 0 self.size = properties.size if properties else 0 self.bold = properties.bold if properties else False diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py index 901f4750..41190ede 100644 --- a/dedoc/readers/docx_reader/data_structures/docx_document.py +++ b/dedoc/readers/docx_reader/data_structures/docx_document.py @@ -4,20 +4,20 @@ import re import zipfile from collections import defaultdict -from typing import Optional, List +from typing import List, Optional from bs4 import BeautifulSoup, Tag -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph from dedoc.readers.docx_reader.data_structures.table import DocxTable from dedoc.readers.docx_reader.data_structures.utils import Counter from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor +from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.docx_reader.styles_extractor import StylesExtractor from dedoc.utils.utils import calculate_file_hash @@ -30,15 +30,15 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L self.path_hash = calculate_file_hash(path=path) self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} - self.document_bs_tree = self.__get_bs_tree('word/document.xml') + self.document_bs_tree = self.__get_bs_tree("word/document.xml") if self.document_bs_tree is None: - self.document_bs_tree = self.__get_bs_tree('word/document2.xml') + self.document_bs_tree = self.__get_bs_tree("word/document2.xml") self.body = self.document_bs_tree.body if self.document_bs_tree else None - self.footnote_extractor = FootnoteExtractor(self.__get_bs_tree('word/footnotes.xml')) - self.endnote_extractor = FootnoteExtractor(self.__get_bs_tree('word/endnotes.xml'), key="endnote") - self.styles_extractor = StylesExtractor(self.__get_bs_tree('word/styles.xml'), logger) - num_tree = self.__get_bs_tree('word/numbering.xml') + self.footnote_extractor = FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")) + self.endnote_extractor = FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote") + self.styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), logger) + num_tree = self.__get_bs_tree("word/numbering.xml") self.numbering_extractor = NumberingExtractor(num_tree, self.styles_extractor) if num_tree else None self.styles_extractor.numbering_extractor = self.numbering_extractor @@ -65,7 +65,7 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]: if not isinstance(paragraph_xml, Tag): continue - if 
paragraph_xml.name == 'tbl' + if paragraph_xml.name == "tbl": self.__handle_table_xml(paragraph_xml, table_refs, uids_set, cnt) continue @@ -73,14 +73,14 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]: self.__handle_diagram_xml(paragraph_xml, diagram_refs, uids_set, cnt) continue - if paragraph_xml.name != 'p': - for subparagraph_xml in paragraph_xml.find_all('w:p'): # TODO check what to add + if paragraph_xml.name != "p": + for subparagraph_xml in paragraph_xml.find_all("w:p"): # TODO check what to add paragraph = self.__xml2paragraph(subparagraph_xml, uids_set, cnt) self.paragraph_list.append(paragraph) continue self.paragraph_list.append(self.__xml2paragraph(paragraph_xml, uids_set, cnt)) - images = paragraph_xml.find_all('pic:pic') + images = paragraph_xml.find_all("pic:pic") if images: self.__handle_images_xml(images, image_refs, uids_set, cnt) @@ -124,12 +124,12 @@ def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]: with zipfile.ZipFile(self.path) as document: content = document.read(filename) content = re.sub(br"\n[\t ]*", b"", content) - soup = BeautifulSoup(content, 'xml') + soup = BeautifulSoup(content, "xml") return soup except KeyError: return None except zipfile.BadZipFile: - raise BadFileFormatException("Bad docx file:\n file_name = {}. Seems docx is broken".format(os.path.basename(self.path))) + raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken") def __xml2paragraph(self, paragraph_xml: Tag, uids_set: set, cnt: Counter) -> Paragraph: uid = self.__get_paragraph_uid(paragraph_xml=paragraph_xml, uids_set=uids_set) @@ -146,12 +146,12 @@ def __xml2paragraph(self, paragraph_xml: Tag, uids_set: set, cnt: Counter) -> Pa def __get_paragraph_uid(self, paragraph_xml: Tag, uids_set: set) -> str: xml_hash = hashlib.md5(paragraph_xml.encode()).hexdigest() - raw_uid = '{}_{}'.format(self.path_hash, xml_hash) + raw_uid = f"{self.path_hash}_{xml_hash}" uid = raw_uid n = 0 while uid in uids_set: n += 1 - uid = raw_uid + "_{}".format(n) + uid = f"{raw_uid}_{n}" uids_set.add(uid) return uid @@ -168,13 +168,13 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict, uids_set: set, cnt: Cou table_refs[len(self.paragraph_list) - 1].append(table_uid) def __handle_images_xml(self, xmls: List[Tag], image_refs: dict, uids_set: set, cnt: Counter) -> None: - rels = self.__get_bs_tree('word/_rels/document.xml.rels') + rels = self.__get_bs_tree("word/_rels/document.xml.rels") if rels is None: - rels = self.__get_bs_tree('word/_rels/document2.xml.rels') + rels = self.__get_bs_tree("word/_rels/document2.xml.rels") images_rels = dict() - for rel in rels.find_all('Relationship'): - if rel["Target"].startswith('media/'): + for rel in rels.find_all("Relationship"): + if rel["Target"].startswith("media/"): images_rels[rel["Id"]] = rel["Target"][6:] self.__prepare_paragraph_list(uids_set, cnt) @@ -208,5 +208,5 @@ def __prepare_paragraph_list(self, uids_set: set, cnt: Counter) -> None: break if not self.paragraph_list: - empty_paragraph = self.__xml2paragraph(BeautifulSoup('<w:p></w:p>').body.contents[0], uids_set, cnt) + empty_paragraph = self.__xml2paragraph(BeautifulSoup("<w:p></w:p>").body.contents[0], uids_set, cnt) self.paragraph_list.append(empty_paragraph) diff --git a/dedoc/readers/docx_reader/data_structures/paragraph.py b/dedoc/readers/docx_reader/data_structures/paragraph.py index 08a9cac9..862dba1d 100644 --- a/dedoc/readers/docx_reader/data_structures/paragraph.py +++
b/dedoc/readers/docx_reader/data_structures/paragraph.py @@ -1,4 +1,5 @@ from typing import Optional + from bs4 import Tag from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties @@ -6,7 +7,7 @@ from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.docx_reader.properties_extractor import change_paragraph_properties, change_run_properties -from dedoc.readers.docx_reader.styles_extractor import StylesExtractor, StyleType +from dedoc.readers.docx_reader.styles_extractor import StyleType, StylesExtractor class Paragraph(BaseProperties): @@ -63,7 +64,7 @@ def __parse(self) -> None: # 3) paragraph styles # 4) numbering styles within styles_extractor if self.xml.pStyle: - self.styles_extractor.parse(self.xml.pStyle['w:val'], self, StyleType.PARAGRAPH) + self.styles_extractor.parse(self.xml.pStyle["w:val"], self, StyleType.PARAGRAPH) # 5) character style parsed later for each run # 6) paragraph direct formatting @@ -111,13 +112,13 @@ def __make_run_list(self) -> None: """ Make runs of the paragraph and adds them to the paragraph list. """ - run_list = self.xml.find_all('w:r') + run_list = self.xml.find_all("w:r") for run_tree in run_list: new_run = Run(self, self.styles_extractor) if run_tree.rStyle: - self.styles_extractor.parse(run_tree.rStyle['w:val'], new_run, StyleType.CHARACTER) + self.styles_extractor.parse(run_tree.rStyle["w:val"], new_run, StyleType.CHARACTER) if self.xml.pPr and self.xml.pPr.rPr: change_run_properties(new_run, self.xml.pPr.rPr) diff --git a/dedoc/readers/docx_reader/data_structures/run.py b/dedoc/readers/docx_reader/data_structures/run.py index 8ffc2f46..138ef6c0 100644 --- a/dedoc/readers/docx_reader/data_structures/run.py +++ b/dedoc/readers/docx_reader/data_structures/run.py @@ -1,4 +1,5 @@ from typing import Optional + from bs4 import Tag from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties @@ -31,12 +32,12 @@ def get_text(self, xml: Tag) -> None: self.text += self.name2char[tag_name] continue - if tag_name == 't' and tag.text: + if tag_name == "t" and tag.text: self.text += tag.text - elif tag_name == 'sym': + elif tag_name == "sym": try: - self.text += chr(int("0x" + tag['w:char'], 16)) + self.text += chr(int("0x" + tag["w:char"], 16)) except KeyError: pass @@ -45,14 +46,14 @@ def get_text(self, xml: Tag) -> None: self.text = self.text.upper() def __repr__(self) -> str: - return "Run({})".format(self.text[:30].replace("\n", r"\n")) + text = self.text[:30].replace("\n", r"\n") + return f"Run({text})" def __eq__(self, other: "Run") -> bool: if not isinstance(other, Run): return False - return (self.size == other.size and - self.bold == other.bold and - self.italic == other.italic and - self.underlined == other.underlined and - self.superscript == other.superscript and - self.subscript == other.subscript) + + size_eq = self.size == other.size + font_eq = self.bold == other.bold and self.italic == other.italic and self.underlined == other.underlined + script_eq = self.superscript == other.superscript and self.subscript == other.subscript + return size_eq and font_eq and script_eq diff --git a/dedoc/readers/docx_reader/data_structures/table.py b/dedoc/readers/docx_reader/data_structures/table.py index 3c99ab4e..f20fe7cb 100644 --- a/dedoc/readers/docx_reader/data_structures/table.py +++ b/dedoc/readers/docx_reader/data_structures/table.py @@ -1,5 +1,6 @@ import hashlib from collections import namedtuple + 
from bs4 import Tag from dedoc.data_structures.table import Table @@ -7,7 +8,7 @@ from dedoc.readers.docx_reader.data_structures.run import Run from dedoc.readers.docx_reader.styles_extractor import StylesExtractor -CellPropertyInfo = namedtuple('NamedTuple', 'colspan, rowspan, invisible') +CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible") class DocxTable: @@ -65,7 +66,7 @@ def to_table(self) -> Table: cell_property_row_list.append(CellPropertyInfo(grid_span, 1, False)) # split merged cells - for span in range(grid_span - 1): + for _ in range(grid_span - 1): cell_property_row_list.append(CellPropertyInfo(1, 1, True)) cell_ind += 1 cells_text.append(cell_text) @@ -87,7 +88,7 @@ def __get_cell_text(self, cell: Tag) -> str: run = Run(None, self.styles_extractor) run.get_text(run_bs) cell_text += run.text - cell_text += '\n' + cell_text += "\n" if cell_text: cell_text = cell_text[:-1] # remove \n in the end diff --git a/dedoc/readers/docx_reader/data_structures/utils.py b/dedoc/readers/docx_reader/data_structures/utils.py index 05c1529c..85bdbc53 100644 --- a/dedoc/readers/docx_reader/data_structures/utils.py +++ b/dedoc/readers/docx_reader/data_structures/utils.py @@ -8,8 +8,8 @@ class Counter: def __init__(self, body: Tag, logger: logging.Logger) -> None: self.logger = logger - self.total_paragraph_number = sum([len(p.find_all('w:p')) for p in body if p.name != 'p' and p.name != "tbl" and isinstance(p, Tag)]) - self.total_paragraph_number += len([p for p in body if p.name == 'p' and isinstance(p, Tag)]) + self.total_paragraph_number = sum([len(p.find_all("w:p")) for p in body if p.name != "p" and p.name != "tbl" and isinstance(p, Tag)]) + self.total_paragraph_number += len([p for p in body if p.name == "p" and isinstance(p, Tag)]) self.current_paragraph_number = 0 self.checkpoint_time = time.time() diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index 814cfdc3..552b09c5 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -1,6 +1,6 @@ import logging import os -from typing import Optional, List +from typing import List, Optional from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor from dedoc.data_structures.hierarchy_level import HierarchyLevel @@ -47,11 +47,11 @@ def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: for i, line in enumerate(lines[1:]): if lines[i].metadata.tag_hierarchy_level != line.metadata.tag_hierarchy_level \ or lines[i].metadata.tag_hierarchy_level.line_type != HierarchyLevel.unknown \ - or lines[i].line.endswith('\n'): + or lines[i].line.endswith("\n"): continue old_len = len(lines[i].line) - lines[i].set_line(lines[i].line + '\n') + lines[i].set_line(lines[i].line + "\n") for annotation in lines[i].annotations: if annotation.end == old_len: diff --git a/dedoc/readers/docx_reader/footnote_extractor.py b/dedoc/readers/docx_reader/footnote_extractor.py index 2e6c12cf..1eb6732e 100644 --- a/dedoc/readers/docx_reader/footnote_extractor.py +++ b/dedoc/readers/docx_reader/footnote_extractor.py @@ -14,7 +14,7 @@ def __init__(self, xml: Optional[BeautifulSoup], key: str = "footnote") -> None: if not xml: return - for footnote in xml.find_all("w:{}".format(key)): + for footnote in xml.find_all(f"w:{key}"): footnote_id = footnote.get("w:id") footnote_text = " ".join(t.text for t in footnote.find_all("w:t") if t.text) if footnote_id and footnote_text: diff --git 
a/dedoc/readers/docx_reader/line_with_meta_converter.py b/dedoc/readers/docx_reader/line_with_meta_converter.py index 7b7f79f3..eb65b3eb 100644 --- a/dedoc/readers/docx_reader/line_with_meta_converter.py +++ b/dedoc/readers/docx_reader/line_with_meta_converter.py @@ -47,7 +47,7 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta: for run, (start, end) in zip(paragraph.runs, paragraph.runs_ids): annotations.append(SizeAnnotation(start=start, end=end, value=str(run.size / 2))) - for property_name in ['bold', 'italic', 'underlined', 'strike', 'superscript', 'subscript']: + for property_name in ["bold", "italic", "underlined", "strike", "superscript", "subscript"]: property_value = getattr(run, property_name) if property_value: annotations.append(self.dict2annotation[property_name](start=start, end=end, value=str(property_value))) diff --git a/dedoc/readers/docx_reader/numbering_extractor.py b/dedoc/readers/docx_reader/numbering_extractor.py index bbbeeb4d..37fe8591 100644 --- a/dedoc/readers/docx_reader/numbering_extractor.py +++ b/dedoc/readers/docx_reader/numbering_extractor.py @@ -1,11 +1,11 @@ import re -from typing import List, Dict, Optional +from typing import Dict, List, Optional -from bs4 import Tag, BeautifulSoup +from bs4 import BeautifulSoup, Tag from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties from dedoc.readers.docx_reader.properties_extractor import change_paragraph_properties, change_run_properties -from dedoc.readers.docx_reader.styles_extractor import StylesExtractor, StyleType +from dedoc.readers.docx_reader.styles_extractor import StyleType, StylesExtractor from dedoc.readers.docx_reader.windows_font_mapping import windows_mapping @@ -31,8 +31,8 @@ def __init__(self, xml: Optional[BeautifulSoup], styles_extractor: StylesExtract self.numbering_formatter = NumberingFormatter() self.state = NumberingState() - abstract_num_dict = {abstract_num['w:abstractNumId']: abstract_num for abstract_num in xml.find_all('w:abstractNum')} - num_dict = {num['w:numId']: num for num in xml.find_all('w:num')} + abstract_num_dict = {abstract_num["w:abstractNumId"]: abstract_num for abstract_num in xml.find_all("w:abstractNum")} + num_dict = {num["w:numId"]: num for num in xml.find_all("w:num")} # dictionary with num properties self.num_dict = {num_id: Num(num_id, abstract_num_dict, num_dict, styles_extractor) for num_id in num_dict} @@ -49,14 +49,14 @@ def parse(self, xml: Tag, paragraph_properties: BaseProperties, run_properties: return ilvl, num_id = xml.ilvl, xml.numId - if not num_id or num_id['w:val'] not in self.num_dict: + if not num_id or num_id["w:val"] not in self.num_dict: return - num_id = num_id['w:val'] + num_id = num_id["w:val"] # find list level if not ilvl: try: - style_id = xml['w:styleId'] + style_id = xml["w:styleId"] num = self.num_dict[num_id] # find link on this styleId in the levels list for level_num, level_info in num.level_number2level_info.items(): @@ -65,7 +65,7 @@ def parse(self, xml: Tag, paragraph_properties: BaseProperties, run_properties: except KeyError: return else: - ilvl = ilvl['w:val'] + ilvl = ilvl["w:val"] lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl] text = self.__get_list_item_text(ilvl, num_id) @@ -97,7 +97,7 @@ def __get_list_item_text(self, ilvl: str, num_id: str) -> str: lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl] # the other list started if self.state.prev_abstract_num_id and self.state.prev_num_id and self.state.prev_abstract_num_id 
!= abstract_num_id \ - and self.num_dict[self.state.prev_num_id].properties['restart']: + and self.num_dict[self.state.prev_num_id].properties["restart"]: del self.state.prev_ilvl_dict[self.state.prev_abstract_num_id] # there is the information about this list @@ -130,12 +130,12 @@ def __get_list_item_text(self, ilvl: str, num_id: str) -> str: self.state.prev_num_id = num_id text = lvl_info.lvl_text - levels = re.findall(r'%\d+', text) + levels = re.findall(r"%\d+", text) for level in levels: - # level = '%level' + # level = "%level" level = level[1:] next_number = self.__get_next_number(num_id, level) - text = re.sub(r'%\d+', next_number, text, count=1) + text = re.sub(r"%\d+", next_number, text, count=1) text += lvl_info.suff return text @@ -176,7 +176,7 @@ class NumberingFormatter: upperLetter="A", # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ... upperRoman="I", # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ... ) - roman_mapping = [(1000, 'm'), (500, 'd'), (100, 'c'), (50, 'l'), (10, 'x'), (5, 'v'), (1, 'i')] + roman_mapping = [(1000, "m"), (500, "d"), (100, "c"), (50, "l"), (10, "x"), (5, "v"), (1, "i")] def get_text(self, num_fmt: str, shift: int) -> str: """ @@ -206,7 +206,7 @@ def get_text(self, num_fmt: str, shift: int) -> str: for number, letter in self.roman_mapping: cnt, shift = shift // number, shift % number if num_fmt == "upperRoman": - letter = chr(ord(letter) + ord('A') - ord('a')) + letter = chr(ord(letter) + ord("A") - ord("a")) result += letter * cnt return result @@ -264,16 +264,16 @@ def __init__(self, tree: Tag, styles_extractor: StylesExtractor) -> None: :param styles_extractor: StylesExtractor """ self.styles_extractor = styles_extractor - self.abstract_num_id = tree['w:abstractNumId'] + self.abstract_num_id = tree["w:abstractNumId"] # properties for all levels {"styleLink", "restart"}, styleLink-> abstractNumId of the other numbering - self.properties = {'styleLink': tree.numStyleLink['w:val'] if tree.numStyleLink else None} + self.properties = {"styleLink": tree.numStyleLink["w:val"] if tree.numStyleLink else None} try: - if tree['w15:restartNumberingAfterBreak']: - self.properties['restart'] = bool(int(tree['w15:restartNumberingAfterBreak'])) + if tree["w15:restartNumberingAfterBreak"]: + self.properties["restart"] = bool(int(tree["w15:restartNumberingAfterBreak"])) except KeyError: - self.properties['restart'] = False + self.properties["restart"] = False # properties for each list level {level number: LevelInfo} self.level_number2level_info = dict() @@ -285,34 +285,34 @@ def parse(self, lvl_list: List[Tag]) -> None: """ for lvl in lvl_list: - ilvl = lvl['w:ilvl'] + ilvl = lvl["w:ilvl"] level_info = self.level_number2level_info.get(ilvl, LevelInfo()) - if lvl.lvlText and lvl.lvlText['w:val']: # lvlText (val="some text %num some text") + if lvl.lvlText and lvl.lvlText["w:val"]: # lvlText (val="some text %num some text") # some characters in bullets are displayed incorrectly, replace them with the unicode equivalent - hex_text = hex(ord(lvl.lvlText['w:val'][0])) - level_info.lvl_text = windows_mapping.get(hex_text, lvl.lvlText['w:val']) + hex_text = hex(ord(lvl.lvlText["w:val"][0])) + level_info.lvl_text = windows_mapping.get(hex_text, lvl.lvlText["w:val"]) if lvl.isLgl: - level_info.num_fmt = 'decimal' + level_info.num_fmt = "decimal" elif lvl.numFmt: # numFmt (val="bullet", "decimal", ...) 
- level_info.num_fmt = lvl.numFmt['w:val'] + level_info.num_fmt = lvl.numFmt["w:val"] if lvl.start: - level_info.start = int(lvl.start['w:val']) + level_info.start = int(lvl.start["w:val"]) if lvl.lvlRestart: - level_info.lvl_restart = bool(int(lvl.lvlRestart['w:val'])) + level_info.lvl_restart = bool(int(lvl.lvlRestart["w:val"])) if level_info.restart is None: - level_info.restart = self.properties['restart'] + level_info.restart = self.properties["restart"] if lvl.suff: # suff (w:val="nothing", "tab" - default, "space") - level_info.suff = self.suffix_dict[lvl.suff['w:val']] + level_info.suff = self.suffix_dict[lvl.suff["w:val"]] # extract information from paragraphs and runs properties if lvl.pStyle: - level_info.style_id = lvl.pStyle['w:val'] + level_info.style_id = lvl.pStyle["w:val"] # paragraph -> run if lvl.pPr: @@ -323,7 +323,7 @@ def parse(self, lvl_list: List[Tag]) -> None: if lvl.startOverride: level_info.restart = True - level_info.start = int(lvl.startOverride['w:val']) + level_info.start = int(lvl.startOverride["w:val"]) self.level_number2level_info[ilvl] = level_info @@ -343,19 +343,19 @@ def __init__(self, """ self.num_id = num_id num_tree = num_dict[num_id] - abstract_num_tree = abstract_num_dict[num_tree.abstractNumId['w:val']] + abstract_num_tree = abstract_num_dict[num_tree.abstractNumId["w:val"]] super().__init__(abstract_num_tree, styles_extractor) # create properties # extract the information from numStyleLink - while self.properties['styleLink']: + while self.properties["styleLink"]: for abstract_num in abstract_num_dict.values(): - if abstract_num.find('w:styleLink', attrs={'w:val': self.properties['styleLink']}): + if abstract_num.find("w:styleLink", attrs={"w:val": self.properties["styleLink"]}): abstract_num_tree = abstract_num break super().__init__(abstract_num_tree, styles_extractor) - self.parse(abstract_num_tree.find_all('w:lvl')) + self.parse(abstract_num_tree.find_all("w:lvl")) # override some of abstractNum properties if num_tree.lvlOverride: - lvl_list = num_tree.find_all('w:lvlOverride') + lvl_list = num_tree.find_all("w:lvlOverride") self.parse(lvl_list) diff --git a/dedoc/readers/docx_reader/properties_extractor.py b/dedoc/readers/docx_reader/properties_extractor.py index cc6c3629..016930a8 100644 --- a/dedoc/readers/docx_reader/properties_extractor.py +++ b/dedoc/readers/docx_reader/properties_extractor.py @@ -4,7 +4,7 @@ def check_if_true(value: str) -> bool: - if value == '1' or value == 'True' or value == 'true': + if value == "1" or value == "True" or value == "true": return True return False @@ -40,7 +40,7 @@ def change_run_properties(old_properties: BaseProperties, tree: Tag) -> None: if tree.u: u_tag = tree.u.get("w:val", False) - if u_tag == 'none': + if u_tag == "none": old_properties.underlined = False elif isinstance(u_tag, str): old_properties.underlined = True @@ -76,7 +76,7 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None: attributes = {attribute: 0 for attribute in ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]} for attribute in attributes: - attributes[attribute] = float(tree.ind.get("w:{}".format(attribute), 0)) + attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0)) indentation = 0 if attributes["left"] != 0: @@ -106,7 +106,7 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None: :param tree: BeautifulSoup tree with properties """ if tree.sz: - old_properties.size = int(tree.sz.get('w:val', old_properties.size)) + old_properties.size = 
int(tree.sz.get("w:val", old_properties.size)) def change_jc(old_properties: BaseProperties, tree: Tag) -> None: @@ -120,23 +120,23 @@ def change_jc(old_properties: BaseProperties, tree: Tag) -> None: return if tree.bidi: - bidi_tag = tree.bidi.get('w:val', True) + bidi_tag = tree.bidi.get("w:val", True) right_to_left = check_if_true(bidi_tag) if isinstance(bidi_tag, str) else bidi_tag else: right_to_left = False - jc_tag = tree.jc.get('w:val', old_properties.jc) + jc_tag = tree.jc.get("w:val", old_properties.jc) - if jc_tag == 'both': - old_properties.jc = 'both' - elif jc_tag == 'center': - old_properties.jc = 'center' - elif jc_tag == 'right': - old_properties.jc = 'right' - elif jc_tag == 'end' and not right_to_left: - old_properties.jc = 'right' - elif jc_tag == 'start' and right_to_left: - old_properties.jc = 'right' + if jc_tag == "both": + old_properties.jc = "both" + elif jc_tag == "center": + old_properties.jc = "center" + elif jc_tag == "right": + old_properties.jc = "right" + elif jc_tag == "end" and not right_to_left: + old_properties.jc = "right" + elif jc_tag == "start" and right_to_left: + old_properties.jc = "right" def change_caps(old_properties: BaseProperties, tree: Tag) -> None: @@ -148,7 +148,7 @@ def change_caps(old_properties: BaseProperties, tree: Tag) -> None: if not tree.caps: return - caps_tag = tree.caps.get('w:val', True) + caps_tag = tree.caps.get("w:val", True) old_properties.caps = check_if_true(caps_tag) if isinstance(caps_tag, str) else caps_tag diff --git a/dedoc/readers/docx_reader/styles_extractor.py b/dedoc/readers/docx_reader/styles_extractor.py index ca6a108e..ec6a0cbf 100644 --- a/dedoc/readers/docx_reader/styles_extractor.py +++ b/dedoc/readers/docx_reader/styles_extractor.py @@ -1,9 +1,9 @@ import logging import re from enum import Enum -from typing import Optional, List +from typing import List, Optional -from bs4 import Tag, BeautifulSoup +from bs4 import BeautifulSoup, Tag from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties from dedoc.readers.docx_reader.data_structures.run import Run @@ -33,12 +33,12 @@ def __init__(self, xml: Optional[BeautifulSoup], logger: logging.Logger) -> None # extract information from docDefaults # docDefaults: rPrDefault + pPrDefault self.doc_defaults = self.styles.docDefaults - self.default_style = self.styles.find_all('w:style', attrs={'w:default': "1", 'w:type': "paragraph"}) + self.default_style = self.styles.find_all("w:style", attrs={"w:default": "1", "w:type": "paragraph"}) self.default_style = self.default_style[0] if self.default_style else None self.__styles_cache = {} self.__styles_hierarchy_cache = {} - self.style_regexp = re.compile(r'heading\s*(\d+)') + self.style_regexp = re.compile(r"heading\s*(\d+)") def parse(self, style_id: Optional[str], old_properties: BaseProperties, style_type: StyleType) -> None: """ @@ -82,7 +82,7 @@ def parse(self, style_id: Optional[str], old_properties: BaseProperties, style_t try: numbering_run = Run(old_properties, self) self.numbering_extractor.parse(style, old_properties, numbering_run) - if hasattr(old_properties, 'runs'): + if hasattr(old_properties, "runs"): old_properties.runs.append(numbering_run) except KeyError as error: self.logger.info(error) @@ -101,7 +101,7 @@ def __get_styles_hierarchy(self, style: Tag, style_id: str, style_type: StyleTyp current_style = style while current_style.basedOn: try: - parent_style_id = current_style.basedOn['w:val'] + parent_style_id = current_style.basedOn["w:val"] current_style = 
self.__find_style(parent_style_id, style_type) if current_style: styles.append(current_style) @@ -136,7 +136,7 @@ def __find_style(self, style_id: str, style_type: StyleType) -> Optional[Tag]: if key in self.__styles_cache: return self.__styles_cache[key] - styles = self.styles.find_all('w:style', attrs={'w:styleId': style_id, 'w:type': style_type.value}) + styles = self.styles.find_all("w:style", attrs={"w:styleId": style_id, "w:type": style_type.value}) if not styles: return None diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py index 467a88d6..368bf27a 100644 --- a/dedoc/readers/email_reader/email_reader.py +++ b/dedoc/readers/email_reader/email_reader.py @@ -1,4 +1,5 @@ import email +import json import logging import mimetypes import os @@ -7,17 +8,16 @@ from email.header import decode_header from email.message import Message from tempfile import NamedTemporaryFile -from typing import Optional, List -import json +from typing import List, Optional from dedoc.data_structures.attached_file import AttachedFile -from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader -from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.utils.utils import save_data_to_unique_file, get_unique_name from dedoc.readers.html_reader.html_reader import HtmlReader +from dedoc.utils.utils import get_unique_name, save_data_to_unique_file class EmailReader(BaseReader): @@ -55,17 +55,17 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio all_header_fields = dict(msg.items()) lines = self.__get_main_fields(msg) - header_filename = "message_header_" + get_unique_name('message_header.json') + header_filename = "message_header_" + get_unique_name("message_header.json") # saving message header into separated file as an attachment header_file_path = os.path.join(os.path.dirname(path), header_filename) - with open(header_file_path, 'w', encoding='utf-8') as f: + with open(header_file_path, "w", encoding="utf-8") as f: json.dump(all_header_fields, f, ensure_ascii=False, indent=4) need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true" attachments.append(AttachedFile(original_name=header_filename, tmp_file_path=header_file_path, - uid="attach_{}".format(uuid.uuid1()), + uid=f"attach_{uuid.uuid1()}", need_content_analysis=need_content_analysis)) html_found = False @@ -111,22 +111,22 @@ def __add_attachment(self, message: Message, path: str, attachments: list, need_ return filename = message.get_filename() - filename = '' if filename is None else self.__get_decoded(filename) + filename = "" if filename is None else self.__get_decoded(filename) filename, extension = os.path.splitext(filename) filename = self.__fix_filename(filename) - filename = str(uuid.uuid4()) if filename == '' else filename + filename = str(uuid.uuid4()) if filename == "" else filename fixed_extension = self.__fix_filename(extension) - if extension == '' or fixed_extension != extension: + if extension == "" or fixed_extension != extension: extension = mimetypes.guess_extension(content_type) - extension = '.txt' if extension == '.bat' else extension + extension = ".txt" if extension == ".bat" else 
extension tmpdir = os.path.dirname(path) filename = f"{filename}{extension}" tmp_file_name = save_data_to_unique_file(directory=tmpdir, filename=filename, binary_data=payload) attachments.append(AttachedFile(original_name=filename, tmp_file_path=os.path.join(tmpdir, tmp_file_name), - uid="attach_{}".format(uuid.uuid1()), + uid=f"attach_{uuid.uuid1()}", need_content_analysis=need_content_analysis)) def __add_content_from_html(self, message: Message, lines: list, tables: list) -> None: diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py index eff279ee..7e290316 100644 --- a/dedoc/readers/excel_reader/excel_reader.py +++ b/dedoc/readers/excel_reader/excel_reader.py @@ -1,5 +1,6 @@ import os from typing import Optional + import xlrd from xlrd.sheet import Sheet @@ -42,9 +43,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio sheet = book.sheet_by_index(sheet_num) tables.append(self.__parse_sheet(sheet_num, sheet)) if self.attachment_extractor.with_attachments(parameters=parameters): - attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), - filename=os.path.basename(path), - parameters=parameters) + attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) else: attachments = [] return UnstructuredDocument(lines=[], tables=tables, attachments=attachments, warnings=[]) diff --git a/dedoc/readers/html2pdf_reader/html2pdf_reader.py b/dedoc/readers/html2pdf_reader/html2pdf_reader.py index 5430e384..c1ec3aa4 100644 --- a/dedoc/readers/html2pdf_reader/html2pdf_reader.py +++ b/dedoc/readers/html2pdf_reader/html2pdf_reader.py @@ -3,7 +3,7 @@ import re from copy import deepcopy from tempfile import TemporaryDirectory -from typing import Optional, Dict, Tuple +from typing import Dict, Optional, Tuple from uuid import uuid1 from bs4 import BeautifulSoup @@ -30,7 +30,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio modified_path, tables = self._modify_html(path, tmp_dir) converted_path = os.path.join(tmp_dir, os.path.basename(path).replace(".html", ".pdf")) HTML(filename=modified_path).write_pdf(converted_path) - self.logger.info("Convert {} to {}".format(modified_path, converted_path)) + self.logger.info(f"Convert {modified_path} to {converted_path}") parameters_new = deepcopy(parameters) parameters_new["pdf_with_text_layer"] = "true" unstructured_document = self.pdf_reader.read(path=converted_path, document_type=document_type, parameters=parameters_new) @@ -57,8 +57,8 @@ def _add_tables(self, document: UnstructuredDocument, tables: Dict[str, Table]) def _handle_tables(self, soup: BeautifulSoup) -> dict: tables = {} - for table_id, table_tag in enumerate(soup.find_all("table")): - table_uid = "table_{}".format(uuid1()) + for table_tag in soup.find_all("table"): + table_uid = f"table_{uuid1()}" table = self._read_table(table_tag) table.metadata.uid = table_uid tables[table_uid] = table @@ -88,7 +88,7 @@ def _handle_super_elements(self, soup: BeautifulSoup) -> None: super_element.decompose() def _modify_html(self, path: str, tmp_dir: str) -> Tuple[str, dict]: - with open(path, encoding='utf-8') as f: + with open(path, encoding="utf-8") as f: soup = BeautifulSoup(f.read(), "html.parser") tables = self._handle_tables(soup) diff --git a/dedoc/readers/html_reader/html_reader.py b/dedoc/readers/html_reader/html_reader.py index e334360f..08bcbda1 100644 --- 
a/dedoc/readers/html_reader/html_reader.py +++ b/dedoc/readers/html_reader/html_reader.py @@ -2,21 +2,22 @@ import logging import string import uuid -from typing import Optional, List, Union +from typing import List, Optional, Union from bs4 import BeautifulSoup -from bs4 import Tag, Doctype, Comment -from dedoc.data_structures.line_with_meta import LineWithMeta +from bs4 import Comment, Doctype, Tag + +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader -from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.utils.utils import calculate_file_hash from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser from dedoc.readers.html_reader.html_tags import HtmlTags +from dedoc.utils.utils import calculate_file_hash class HtmlReader(BaseReader): @@ -46,7 +47,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters - with open(path, 'rb') as f: + with open(path, "rb") as f: soup = BeautifulSoup(f.read(), "html.parser") handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true" @@ -68,7 +69,7 @@ def __handle_block(self, tag: Union[Tag], uid: str, handle_invisible_table: bool elif isinstance(tag, str): block_lines = self._handle_text_line(block=tag, path_hash=uid) elif tag.name not in HtmlTags.available_tags: - self.logger.debug("skip tag {}".format(tag.name.encode())) + self.logger.debug(f"skip tag {tag.name.encode()}") block_lines = [] elif tag.name in HtmlTags.special_symbol_tags: tag_value = HtmlTags.special_symbol_tags[tag.name] @@ -94,12 +95,7 @@ def __handle_single_tag(self, tag: Tag, uid: str) -> List[LineWithMeta]: header_level = int(tag.name[1:]) if tag.name in HtmlTags.header_tags else 0 line_type = HierarchyLevel.unknown if header_level == 0 else HierarchyLevel.header tag_uid = hashlib.md5((uid + text).encode()).hexdigest() - line = self.__make_line(line=text, - line_type=line_type, - header_level=header_level, - uid=tag_uid, - path_hash=uid, - annotations=annotations) + line = self.__make_line(line=text, line_type=line_type, header_level=header_level, uid=tag_uid, path_hash=uid, annotations=annotations) line.metadata.extend_other_fields({"html_tag": tag.name}) return [line] @@ -138,11 +134,11 @@ def __make_line(self, line: str, level = None if header_level == 0 else HierarchyLevel(1, header_level, False, line_type=line_type) metadata = LineMetadata(page_id=0, line_id=None, tag_hierarchy_level=level) # TODO line_id - uid = "{}_{}".format(path_hash, uid) + uid = f"{path_hash}_{uid}" return LineWithMeta(line=line, metadata=metadata, annotations=annotations, uid=uid) def __get_li_header(self, list_type: str, index: int) -> LineWithMeta: - end = ') ' if list_type in ["a", 'A'] else '. ' + end = ") " if list_type in ["a", "A"] else ". 
" if list_type == "": header = "" @@ -200,7 +196,7 @@ def __handle_list_item(self, # not currently used, but may be useful in the future def __get_text(self, tag: Tag) -> [str, int, int]: - text = tag.getText() + '\n' if tag.name == "p" else tag.getText() + text = tag.getText() + "\n" if tag.name == "p" else tag.getText() text = "" if text is None else text return text @@ -221,14 +217,11 @@ def __handle_invisible_table(self, block: Tag, path_hash: str) -> List[LineWithM uid = hashlib.md5(block.name.encode()).hexdigest() result = [] rows = self._read_table(block).cells - for row_id, row in enumerate(rows): + for row in rows: text = " ".join(row) if text.strip() != "": tag_uid = hashlib.md5((uid + text).encode()).hexdigest() - line = self.__make_line(line=text, - line_type=HierarchyLevel.unknown, - uid=tag_uid, - path_hash=path_hash) + line = self.__make_line(line=text, line_type=HierarchyLevel.unknown, uid=tag_uid, path_hash=path_hash) result.append(line) return result @@ -242,7 +235,7 @@ def _read_table(self, table: Tag) -> Table: def _visible_table(self, table: Tag, handle_invisible_table: bool) -> bool: if handle_invisible_table: return True - assert table.name == "table", "block {} is not table".format(table) + assert table.name == "table", f"block {table} is not table" for td in table.find_all("td"): style = td.attrs.get("style", "") if "border-bottom-style:solid" in style or "border-top-style:solid" in style: diff --git a/dedoc/readers/html_reader/html_tag_annotation_parser.py b/dedoc/readers/html_reader/html_tag_annotation_parser.py index 251e2da1..35fe8bcf 100644 --- a/dedoc/readers/html_reader/html_tag_annotation_parser.py +++ b/dedoc/readers/html_reader/html_tag_annotation_parser.py @@ -1,6 +1,7 @@ -from typing import List, Union, Optional, Tuple +from typing import List, Optional, Tuple, Union from bs4 import Tag + from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.concrete_annotations.alignment_annotation import AlignmentAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation @@ -38,8 +39,8 @@ def __parse_annotations(self, tag: Tag, start: int = 0) -> Tuple[int, List[Annot annotations.extend(self.__create_annotations(tag, start, start + curr_len)) - if 'style' in tag.attrs.keys(): - annotations.extend(self.__parse_style_string(tag.attrs['style'], start, start + curr_len)) + if "style" in tag.attrs.keys(): + annotations.extend(self.__parse_style_string(tag.attrs["style"], start, start + curr_len)) return curr_len, annotations @@ -64,47 +65,56 @@ def __create_annotations(self, tag: Union[str, Tag], start: int, end: int) -> Li def __parse_style_string(self, styles_string: str, start: int, end: int) -> List[Annotation]: annotations = [] - styles_list = styles_string.split(';') + styles_list = styles_string.split(";") + for st in styles_list: st = st.strip() if not st: continue - pair = st.split(':') + pair = st.split(":") if len(pair) != 2: continue - key, value = st.split(':') + key, value = st.split(":") value = value.strip() - if key == 'font-style': - if value == 'italic': - annotations.append(ItalicAnnotation(start, end, value="True")) - elif key == 'font-weight': - if value == 'bold': - annotations.append(BoldAnnotation(start, end, value="True")) - elif key == 'font-size': - font_size = self.__parse_font_size_style(value) - - if font_size is not None: - annotations.append(SizeAnnotation(start, end, value=font_size)) - elif key == 'text-align': - if value in ['justify', 'inherit', 'auto']: - continue - 
elif value in AlignmentAnnotation.valid_values: - annotations.append(AlignmentAnnotation(start, end, value=value)) - elif value in ['start', 'end']: # additional fields for left - annotations.append(AlignmentAnnotation(start, end, value="left")) - else: - continue - elif key == 'font-family': - annotations.append(StyleAnnotation(start, end, value=value)) - elif key == 'display': - if value in {"none", "hidden"}: - annotations.append(StyleAnnotation(start, end, value="hidden")) + annotation = self.__get_annotation(key=key, value=value, start=start, end=end) + if annotation: + annotations.append(annotation) return annotations + def __get_annotation(self, key: str, value: str, start: int, end: int) -> Optional[Annotation]: + if key == "font-style": + annotation = ItalicAnnotation(start, end, value="True") if value == "italic" else None + return annotation + + if key == "font-weight": + annotation = BoldAnnotation(start, end, value="True") if value == "bold" else None + return annotation + + if key == "font-size": + font_size = self.__parse_font_size_style(value) + annotation = SizeAnnotation(start, end, value=font_size) if font_size is not None else None + return annotation + + if key == "text-align": + annotation = None + if value in AlignmentAnnotation.valid_values: + annotation = AlignmentAnnotation(start, end, value=value) + elif value in ["start", "end"]: # additional fields for left + annotation = AlignmentAnnotation(start, end, value="left") + + return annotation + + if key == "font-family": + return StyleAnnotation(start, end, value=value) + + if key == "display": + annotation = StyleAnnotation(start, end, value="hidden") if value in {"none", "hidden"} else None + return annotation + def __parse_font_size_style(self, value: str) -> Optional[str]: if value.endswith("pt"): return value[:-2] diff --git a/dedoc/readers/json_reader/json_reader.py b/dedoc/readers/json_reader/json_reader.py index e95d1dff..623830a2 100644 --- a/dedoc/readers/json_reader/json_reader.py +++ b/dedoc/readers/json_reader/json_reader.py @@ -1,12 +1,12 @@ import os from json import JSONDecodeError -from typing import Optional, List, Any +from typing import Any, List, Optional import ujson as json from dedoc.attachments_extractors.concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException -from dedoc.common.exceptions.bad_parameters_exception import BadParametersException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError +from dedoc.common.exceptions.bad_parameters_error import BadParametersError from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta @@ -42,18 +42,16 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio try: json_data = json.load(file) except (JSONDecodeError, ValueError): - raise BadFileFormatException(msg="Seems that json is invalid") + raise BadFileFormatError(msg="Seems that json is invalid") if "html_fields" in parameters: fields = parameters.get("html_fields", "[]") try: key_fields = json.loads(fields if fields else "[]") except (JSONDecodeError, ValueError): - raise BadParametersException("can't read html_fields {}".format(fields)) + raise BadParametersError(f"can't read html_fields {fields}") json_data = self.__exclude_html_fields(json_data, key_fields) - attachments = 
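Stepping back to the style-parsing refactor above: __parse_style_string splits the inline CSS on semicolons and colons and delegates each property to __get_annotation. A standalone sketch of the same mapping idea (illustrative names, not the library API):

# Simplified sketch of mapping an inline CSS style string to annotation kinds.
def parse_style(styles_string: str) -> dict:
    result = {}
    for item in styles_string.split(";"):
        pair = item.strip().split(":")
        if len(pair) != 2:
            continue
        key, value = pair[0].strip(), pair[1].strip()
        if key == "font-weight" and value == "bold":
            result["bold"] = True
        elif key == "font-style" and value == "italic":
            result["italic"] = True
        elif key == "font-size" and value.endswith("pt"):
            result["size"] = value[:-2]
    return result

assert parse_style("font-weight:bold; font-size:12pt") == {"bold": True, "size": "12"}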
self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), - filename=os.path.basename(path), - parameters=parameters) + attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) else: attachments = [] @@ -67,10 +65,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio if isinstance(element, list) and len(element) > 0: self.__handle_list(depth, element, result, stack) elif self.__is_flat(element): - line = self.__handle_one_element(depth=depth, - value=str(element), - line_type=HierarchyLevel.raw_text, - line_type_meta=HierarchyLevel.raw_text) + line = self.__handle_one_element(depth=depth, value=str(element), line_type=HierarchyLevel.raw_text, line_type_meta=HierarchyLevel.raw_text) result.append(line) return UnstructuredDocument(tables=[], lines=result, attachments=attachments) @@ -97,10 +92,7 @@ def __exclude_key(self, json_data: dict, keys: List[str]) -> None: def __handle_list(self, depth: int, element: list, result: list, stack: list) -> None: for _ in range(len(element)): sub_element = element.pop(0) - line = self.__handle_one_element(depth=depth, - value=sub_element, - line_type=HierarchyLevel.list_item, - line_type_meta=HierarchyLevel.list_item) + line = self.__handle_one_element(depth=depth, value=sub_element, line_type=HierarchyLevel.list_item, line_type_meta=HierarchyLevel.list_item) result.append(line) if not self.__is_flat(sub_element): stack.append((element, depth)) @@ -111,10 +103,7 @@ def __handle_dict(self, depth: int, element: dict, result: list, stack: list) -> for key in sorted(element.keys()): # key = min(element.keys()) if len(element) < 100 else list(element.keys())[0] value = element.pop(key) - line = self.__handle_one_element(depth=depth, - value=key, - line_type="key", - line_type_meta="key") + line = self.__handle_one_element(depth=depth, value=key, line_type="key", line_type_meta="key") result.append(line) stack.append((element, depth)) @@ -122,7 +111,7 @@ def __handle_dict(self, depth: int, element: dict, result: list, stack: list) -> stack.append((value, depth + 1)) break - def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type_meta: str) -> LineWithMeta: + def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type_meta: str) -> LineWithMeta: # noqa if depth == 1 and line_type == "title": level1, level2 = 0, 0 else: @@ -133,10 +122,10 @@ def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type line = LineWithMeta(line=self.__get_text(value), metadata=metadata, annotations=[]) return line - def __is_flat(self, value: Any) -> bool: + def __is_flat(self, value: Any) -> bool: # noqa return not isinstance(value, (dict, list)) - def __get_text(self, value: Any) -> str: + def __get_text(self, value: Any) -> str: # noqa if isinstance(value, (dict, list)) or value is None: return "" diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index 68174ce1..3ce1d83c 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -5,17 +5,18 @@ import shutil import tempfile import uuid -from typing import Optional, List +from typing import List, Optional from urllib.parse import urlparse from bs4 import BeautifulSoup + from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader 
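The JsonReader traversal above flattens nested JSON into lines, walking dicts by sorted key and lists item by item with an explicit stack. A simplified recursive sketch of the same flattening idea (illustrative, not the library API):

# Flatten nested JSON into (depth, text) pairs, the way JsonReader walks dicts and lists.
def flatten(value, depth=1):
    if isinstance(value, dict):
        for key in sorted(value):
            yield depth, str(key)
            yield from flatten(value[key], depth + 1)
    elif isinstance(value, list):
        for item in value:
            yield from flatten(item, depth)
    else:
        yield depth, "" if value is None else str(value)

print(list(flatten({"title": "doc", "items": [1, 2]})))
# [(1, 'items'), (2, '1'), (2, '2'), (1, 'title'), (2, 'doc')]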
import BaseReader +from dedoc.readers.html_reader.html_reader import HtmlReader from dedoc.utils import supported_image_types -from dedoc.utils.utils import get_encoding, calculate_file_hash +from dedoc.utils.utils import calculate_file_hash, get_encoding from dedoc.utils.utils import check_filename_length -from dedoc.readers.html_reader.html_reader import HtmlReader class MhtmlReader(BaseReader): @@ -71,23 +72,23 @@ def __extract_files(self, path: str, save_dir: str) -> List[str]: with gzip.open(path, "rt") as f: message = email.message_from_file(f) else: - with open(path, 'r') as f: + with open(path, "r") as f: message = email.message_from_file(f) - self.logger.info('Extracting {}'.format(path)) + self.logger.info(f"Extracting {path}") for part in message.walk(): if part.is_multipart(): continue content_type = part.get("Content-type", "") - content_location = part['Content-Location'] - content_name = os.path.basename(urlparse(content_location).path) or '{}.html'.format(os.path.basename(os.path.splitext(path)[0])) + content_location = part["Content-Location"] + content_name = os.path.basename(urlparse(content_location).path) or f"{os.path.basename(os.path.splitext(path)[0])}.html" if content_type == "text/html" and not content_name.endswith(".html"): content_name += ".html" content_name = check_filename_length(content_name) with tempfile.TemporaryDirectory() as tmpdir: tmp_path = os.path.join(tmpdir, content_name) - with open(tmp_path, 'wb') as fp: + with open(tmp_path, "wb") as fp: fp.write(part.get_payload(decode=True)) file_hash = calculate_file_hash(tmp_path) @@ -123,7 +124,7 @@ def __get_attachments(self, save_dir: str, names_list: List[str], need_content_a continue attachment = AttachedFile(original_name=os.path.basename(file_name), tmp_file_path=os.path.join(save_dir, file_name), - uid="attach_{}".format(uuid.uuid1()), + uid=f"attach_{uuid.uuid1()}", need_content_analysis=need_content_analysis) attachments.append(attachment) return attachments diff --git a/dedoc/readers/note_reader/note_reader.py b/dedoc/readers/note_reader/note_reader.py index 92b73bfa..c39acbf6 100644 --- a/dedoc/readers/note_reader/note_reader.py +++ b/dedoc/readers/note_reader/note_reader.py @@ -3,9 +3,9 @@ import pickle from typing import Optional -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException -from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader @@ -35,9 +35,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio """ try: - with open(path, 'rb') as infile: + with open(path, "rb") as infile: note_dict = pickle.load(infile) - text = note_dict['content'] + text = note_dict["content"] if isinstance(text, bytes): text = text.decode() lines = [LineWithMeta(line=text, annotations=[], metadata=LineMetadata(line_id=0, page_id=0))] @@ -46,4 +46,4 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio return unstructured except Exception as e: self.logger.warning(f"Can't handle {path}\n{e}") - raise BadFileFormatException(f"Bad note file:\n file_name = {os.path.basename(path)}. Seems note-format is broken") + raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(path)}. 
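An MHTML file is a MIME multipart archive, so MhtmlReader above parses it with the email module (optionally through gzip) and saves each non-multipart payload. A minimal sketch of that unpacking loop; the file names are hypothetical and error handling is omitted:

import email

with open("page.mhtml", "r") as f:
    message = email.message_from_file(f)

for part in message.walk():
    if part.is_multipart():
        continue
    content_type = part.get("Content-type", "")
    if content_type == "text/html":
        with open("saved.html", "wb") as out:
            out.write(part.get_payload(decode=True))  # decode=True yields bytes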
Seems note-format is broken") diff --git a/dedoc/readers/pdf_reader/data_classes/line_with_location.py b/dedoc/readers/pdf_reader/data_classes/line_with_location.py index 3583abfc..190c1d87 100644 --- a/dedoc/readers/pdf_reader/data_classes/line_with_location.py +++ b/dedoc/readers/pdf_reader/data_classes/line_with_location.py @@ -8,13 +8,7 @@ class LineWithLocation(LineWithMeta): - def __init__(self, - line: str, - metadata: LineMetadata, - annotations: List[Annotation], - location: Location, - uid: str = None, - order: int = -1) -> None: + def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotation], location: Location, uid: str = None, order: int = -1) -> None: self.location = location self.order = order super().__init__(line, metadata, annotations, uid) diff --git a/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py b/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py index a6eaa20d..75b70111 100644 --- a/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py +++ b/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py @@ -1,4 +1,5 @@ from typing import List + import numpy as np from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment @@ -7,11 +8,7 @@ class PageWithBBox: - def __init__(self, - image: np.ndarray, - bboxes: List[TextWithBBox], - page_num: int, - attachments: List[PdfImageAttachment] = None) -> None: + def __init__(self, image: np.ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None) -> None: self.image = image self.bboxes = bboxes self.page_num = page_num diff --git a/dedoc/readers/pdf_reader/data_classes/pdf_image_attachment.py b/dedoc/readers/pdf_reader/data_classes/pdf_image_attachment.py index 8f26b31e..634a361b 100644 --- a/dedoc/readers/pdf_reader/data_classes/pdf_image_attachment.py +++ b/dedoc/readers/pdf_reader/data_classes/pdf_image_attachment.py @@ -4,16 +4,7 @@ class PdfImageAttachment(AttachedFile): - def __init__(self, - original_name: str, - tmp_file_path: str, - need_content_analysis: bool, - uid: str, - location: Location, - order: int = -1) -> None: + def __init__(self, original_name: str, tmp_file_path: str, need_content_analysis: bool, uid: str, location: Location, order: int = -1) -> None: self.location = location self.order = order - super().__init__(original_name=original_name, - tmp_file_path=tmp_file_path, - need_content_analysis=need_content_analysis, - uid=uid) + super().__init__(original_name=original_name, tmp_file_path=tmp_file_path, need_content_analysis=need_content_analysis, uid=uid) diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index 9308b1da..c4f1ada5 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -40,7 +40,8 @@ def __init__(self, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None, - contour_coord: BBox = BBox(0, 0, 0, 0)) -> None: + contour_coord: Optional[BBox] = None) -> None: + assert x_top_left <= x_bottom_right assert y_top_left <= y_bottom_right self.x_top_left = x_top_left @@ -49,19 +50,19 @@ def __init__(self, self.y_bottom_right = y_bottom_right self.id_con = id_con if not isinstance(text, str): - raise ValueError("get {} ({}) instead of text".format(text.__class__, text)) + raise ValueError(f"get {text.__class__} ({text}) instead of text") self.text = text self.is_attribute = is_attribute self.is_attribute_required = 
is_attribute_required self.rotated_angle = rotated_angle - self.cell_uid = "cell_{}".format(uuid.uuid1()) if uid is None else uid + self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid self.colspan = 1 self.rowspan = 1 self.invisible = False - self.con_coord = contour_coord + self.con_coord = contour_coord or BBox(0, 0, 0, 0) def __str__(self) -> str: - return "Cell((cs={}, rs={}, {})".format(self.colspan, self.rowspan, self.text) + return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.text})" def __repr__(self) -> str: return self.__str__() diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py index d6c25c4b..86ed6d26 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/location.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/location.py @@ -1,7 +1,7 @@ import math from collections import OrderedDict from functools import total_ordering -from typing import Tuple, Dict, Any +from typing import Any, Dict, Tuple from dedoc.data_structures.bbox import BBox diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index d90b30b8..a9df84cf 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -1,6 +1,7 @@ import copy from collections import OrderedDict -from typing import List, Any +from typing import Any, List + import numpy as np from dedoc.data_structures.bbox import BBox @@ -9,12 +10,7 @@ class ScanTable: - def __init__(self, - page_number: int, - matrix_cells: List[List[Cell]] = None, - bbox: BBox = None, - name: str = "", - order: int = -1) -> None: + def __init__(self, page_number: int, matrix_cells: List[List[Cell]] = None, bbox: BBox = None, name: str = "", order: int = -1) -> None: self.matrix_cells = matrix_cells self.page_number = page_number self.locations = [] @@ -40,7 +36,7 @@ def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]: return attrs @staticmethod - def get_key_value_attrs(attrs: List, val: Any) -> dict: + def get_key_value_attrs(attrs: List, val: Any) -> dict: # noqa res_attrs = [] for i in range(0, len(attrs)): res_attrs.append({"attr": attrs[i]}) diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py index 6abfe741..55c98c0d 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/table_tree.py @@ -1,13 +1,14 @@ import logging from collections import namedtuple from typing import List + import cv2 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor logger = logging.getLogger("TableRecognizer.TableTree") -'''-------------------------------Таблица в виде дерева, полученная от OpenCV----------------------------------------''' +"""-------------------------------Таблица в виде дерева, полученная от OpenCV----------------------------------------""" ContourCell = namedtuple("ContourCell", ["id_con", "image"]) @@ -47,7 +48,7 @@ def set_text_into_tree(tree: "TableTree", img_cell = [pair.image for i, pair in enumerate(cell_images) if pair.id_con == tree.id_contours][0] trees.append((tree, img_cell)) if tree.config.get("debug_mode", False): - config.get("logger", logging.getLogger()).debug("{} : text : {}".format(tree.id_contours, tree.text)) + config.get("logger", logging.getLogger()).debug(f"{tree.id_contours} : text : 
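The Cell change above, replacing the `contour_coord: BBox = BBox(0, 0, 0, 0)` default with `Optional[BBox] = None`, avoids Python's shared-default pitfall: a default value is evaluated once at function definition time, so every Cell would otherwise share one BBox instance. A minimal illustration with a list:

# Default arguments are evaluated once, so mutable defaults are shared:
def bad(box=[0, 0, 0, 0]):
    box[0] += 1
    return box

assert bad() == [1, 0, 0, 0]
assert bad() == [2, 0, 0, 0]  # the same list object is mutated again

# The fix used above: default to None and create a fresh object inside.
def good(box=None):
    box = box or [0, 0, 0, 0]
    box[0] += 1
    return box

assert good() == [1, 0, 0, 0]
assert good() == [1, 0, 0, 0]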
{tree.text}") for ch in tree.children: stack.append((ch, cur_depth + 1, begin_depth, end_depth)) # texts = [get_cell_text_by_ocr(image, language=language) for _, image in trees] @@ -58,7 +59,7 @@ def set_text_into_tree(tree: "TableTree", else: texts = cell_extractor.get_cells_text(img_cells=images, language=language) assert len(trees) == len(texts) - for text, (tree, img_cell) in zip(texts, trees): + for text, (tree, _) in zip(texts, trees): tree.text = text @staticmethod @@ -78,13 +79,9 @@ def print_tree(self, depth: int) -> None: if not self.data_bb or not self.id_contours: return - indent = ''.join(['\t' for _ in range(depth)]) - self.logger.debug("{}{} : coord: {}, {}, {}, {}".format(indent, - self.id_contours, - self.data_bb[0], - self.data_bb[1], - self.data_bb[0] + self.data_bb[2], - self.data_bb[1] + self.data_bb[3])) + indent = "".join(["\t" for _ in range(depth)]) + self.logger.debug(f"{indent}{self.id_contours} : coord: {self.data_bb[0]}, {self.data_bb[1]}, {self.data_bb[0] + self.data_bb[2]}, " + f"{self.data_bb[1] + self.data_bb[3]}") for ch in self.children: ch.print_tree(depth + 1) @@ -96,7 +93,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> " # Эвристика №1 на ячейку if bounding_box[2] < self.config["min_w_cell"] or bounding_box[3] < self.config["min_h_cell"]: if self.config.get("debug_mode", False): - self.logger.debug("Contour {} isn't correct".format(i)) + self.logger.debug(f"Contour {i} isn't correct") continue t = TableTree(config=self.config) t.id_contours = i diff --git a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py index 3de4db30..30cb50d2 100644 --- a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py +++ b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py @@ -1,11 +1,11 @@ from collections import OrderedDict -from typing import Optional, List +from typing import List, Optional from uuid import uuid1 from dedoc.data_structures.annotation import Annotation -from dedoc.data_structures.serializable import Serializable -from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.serializable import Serializable class TextWithBBox(Serializable): @@ -26,10 +26,10 @@ def __init__(self, self.annotations = [] if annotations is None else annotations if BBoxAnnotation.name not in [annotation.name for annotation in self.annotations]: self.annotations.append(BBoxAnnotation(start=0, end=len(text), value=bbox)) - self.uid = "bbox_{}".format(uuid1()) if uid is None else uid + self.uid = f"bbox_{uuid1()}" if uid is None else uid def __str__(self) -> str: - return "TextWithBBox(bbox = {}, page = {}, text = {})".format(self.bbox, self.page_num, self.text) + return f"TextWithBBox(bbox = {self.bbox}, page = {self.page_num}, text = {self.text})" def __repr__(self) -> str: return self.__str__() diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/catboost_model_extractor.py b/dedoc/readers/pdf_reader/pdf_auto_reader/catboost_model_extractor.py new file mode 100644 index 00000000..1ef14c04 --- /dev/null +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/catboost_model_extractor.py @@ -0,0 +1,79 @@ +import gzip +import logging +import os +import pickle +from typing import List +import catboost.core +from dedoc.download_models import download_from_hub + +from dedoc.config import get_config +from 
dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
+
+
+class CatboostModelExtractor:
+    """
+    The CatboostModelExtractor class is used for detecting the correctness of the text layer in a PDF document
+    using a CatBoost model.
+    """
+    def __init__(self, *, config: dict) -> None:
+        self.config = config
+        self.logger = config.get("logger", logging.getLogger())
+        eng = list(map(chr, range(ord("a"), ord("z") + 1)))
+        rus = [chr(i) for i in range(ord("а"), ord("а") + 32)]
+        rus.append("ё")
+
+        digits = [str(i) for i in range(10)]
+        special_symbols = [i for i in "<>~!@#$%^&*_+-/\"|?.,:;'`= "]
+        brackets = [i for i in "{}[]()"]
+        self.list_letters = eng + [i.upper() for i in eng] + rus + [i.upper() for i in rus]
+        self.list_symbols = digits + special_symbols + brackets
+
+        self.path = os.path.join(get_config()["resources_path"], "catboost_detect_tl_correctness.pkl.gz")
+        self.__model = None
+
+    @property
+    def __get_model(self) -> catboost.core.CatBoostClassifier:
+        if self.__model is not None:
+            return self.__model
+
+        if not os.path.isfile(self.path):
+            out_dir, out_name = os.path.split(self.path)
+            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="catboost_detect_tl_correctness", hub_name="model.pkl.gz")
+
+        assert os.path.isfile(self.path)
+        with gzip.open(self.path, "rb") as f:
+            self.__model = pickle.load(f)
+
+        return self.__model
+
+    def detect_text_layer_correctness(self, text_layer_bboxes: List[TextWithBBox]) -> bool:
+        """
+        Detect the correctness of the text layer in a PDF document.
+        :param text_layer_bboxes: List of text lines with bounding boxes.
+        :returns: True if the text layer is correct, False otherwise.
+        """
+        text_layer = "".join([pdf_line.text for pdf_line in text_layer_bboxes])
+        if not text_layer:
+            return False
+
+        features = self.__get_feature_for_predict(text_layer)
+        return bool(self.__get_model.predict(features) == 1)
+
+    def __get_feature_for_predict(self, text: str) -> List[float]:
+        list_of_sub = []
+        num_letters_in_data = self._count_letters(text)
+        num_other_symbol_in_data = self._count_other(text)
+        for symbol in self.list_letters:
+            # proportion of occurring English and Russian letters
+            list_of_sub.append(round(text.count(symbol) / num_letters_in_data, 5) if num_letters_in_data != 0 else 0.0)
+        for symbol in self.list_symbols:
+            list_of_sub.append(text.count(symbol))
+        list_of_sub.append((num_letters_in_data + num_other_symbol_in_data) / len(text) if len(text) != 0 else 0)
+        return list_of_sub
+
+    def _count_letters(self, text: str) -> int:
+        return sum(1 for symbol in text if symbol in self.list_letters)
+
+    def _count_other(self, text: str) -> int:
+        return sum(1 for symbol in text if symbol in self.list_symbols)
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
index 3de341c4..a370c33c 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -13,7 +13,7 @@
 from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
 from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
 from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
-from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer, get_param_page_slice
+from dedoc.utils.parameter_utils import get_param_page_slice, get_param_pdf_with_txt_layer


 class PdfAutoReader(BaseReader):
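Backing up to CatboostModelExtractor above: the feature vector is letter proportions followed by raw symbol counts. An illustrative computation of the proportion part (alphabet shortened to ASCII for brevity; not the full feature vector):

letters = [chr(c) for c in range(ord("a"), ord("z") + 1)]
text = "correct text layer"
num_letters = sum(1 for s in text if s in letters)
proportions = [round(text.count(s) / num_letters, 5) for s in letters]
assert abs(sum(proportions) - 1.0) < 1e-3  # proportions over counted letters sum to ~1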
@@ -23,7 +23,8 @@ class PdfAutoReader(BaseReader):
     :class:`~dedoc.readers.PdfAutoReader` is used for automatic detection of a correct textual layer in the given PDF file:

-    * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtLayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used for document content extraction;
+    * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtLayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
+      for document content extraction;

     * if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction.

@@ -86,11 +87,7 @@ def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnin
         result = self.pdf_image_reader.read(path=path, parameters=parameters_copy)
         return result

-    def __handle_correct_text_layer(self,
-                                    is_first_page_correct: bool,
-                                    parameters: dict,
-                                    path: str,
-                                    warnings: list) -> UnstructuredDocument:
+    def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: dict, path: str, warnings: list) -> UnstructuredDocument:
         self.logger.info(f"Assume document {os.path.basename(path)} has a correct textual layer")
         warnings.append("Assume document has a correct textual layer")
         recognized_first_page = None
@@ -152,7 +149,4 @@ def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDoc
                            if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables)]
             new_line = LineWithMeta(line=line.line, metadata=line.metadata, annotations=annotations, uid=line.uid)
             lines.append(new_line)
-        return UnstructuredDocument(tables=tables,
-                                    lines=lines,
-                                    attachments=first.attachments + second.attachments,
-                                    metadata=second.metadata)
+        return UnstructuredDocument(tables=tables, lines=lines, attachments=first.attachments + second.attachments, metadata=second.metadata)
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_txtlayer_correctness.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_txtlayer_correctness.py
new file mode 100644
index 00000000..b83ed1ef
--- /dev/null
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_txtlayer_correctness.py
@@ -0,0 +1,213 @@
+import logging
+from collections import namedtuple
+from typing import List
+
+import cv2
+import numpy as np
+from pdf2image import convert_from_path
+from pdf2image.exceptions import PDFPageCountError
+
+import dedoc.utils.parameter_utils as param_utils
+from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
+from dedoc.readers.pdf_reader.pdf_auto_reader.catboost_model_extractor import CatboostModelExtractor
+from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_txtlayer_parameters import PdfTxtlayerParameters
+from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_document_page
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer
+from dedoc.utils.pdf_utils import get_pdf_page_count
+from dedoc.utils.utils import similarity_levenshtein
+
+
+class PdfTextLayerCorrectness:
+
+    def __init__(self, *, config: dict) -> None:
+        self.config = config
+        self.logger = config.get("logger", logging.getLogger())
+        self.check_page_num = 5
+        self.pdf_page_text_layer_param = namedtuple("Param", "page_num_with_max_text_size have_text text_layer_bboxes")
+        self.catboost_model_extractor = CatboostModelExtractor(config=config)
+
+    def with_text_layer(self, path: str, parameters: dict, is_one_column_list: List[bool]) -> PdfTxtlayerParameters:
+        """
+        Check whether the PDF file has a text layer and classify the document as a booklet or not.
+        :param path: path to the PDF file
+        :param parameters: parameters for the classifier
+        :param is_one_column_list: per-page flags, True if the page has a one-column layout
+        :return: PdfTxtlayerParameters with information about the text layer and whether the document is a booklet
+        """
+        threshold_similarity = self.config.get("threshold_similarity", 0.5)
+        # get the first image from PDF
+        try:
+            page_count = get_pdf_page_count(path)
+            image, page_number, page_count = self._get_image_from_first_page(path=path, page_count=page_count)
+            is_booklet = self.__is_booklet(image)
+            lang = param_utils.get_param_language(parameters)
+            pdf_page_text_layer_param = \
+                self._get_page_num_and_have_text_flag_from_text_layer(path=path, is_one_column_list=is_one_column_list, page_count=page_count)
+            if pdf_page_text_layer_param.have_text:
+                return self._detect_text_layer(path=path,
+                                               pdf_page_text_layer_param=pdf_page_text_layer_param,
+                                               is_one_column_list=is_one_column_list,
+                                               is_booklet=is_booklet, lang=lang,
+                                               threshold_similarity=threshold_similarity)
+            else:
+                return PdfTxtlayerParameters(False, False, is_booklet)
+        except PDFPageCountError:
+            return PdfTxtlayerParameters(False, False, False)
+
+    @staticmethod
+    def __is_booklet(image: np.ndarray) -> bool:
+        """
+        A booklet is a colorful document with a complex background. Booklets require special handling,
+        so we classify each document as a booklet or not.
+        :param image: image of the document page in RGB format.
+        :return: True if the document is a booklet, False otherwise.
+        """
+        # convert image from RGB to HSV (https://en.wikipedia.org/wiki/HSL_and_HSV)
+        # in that space, booklets are well separated from ordinary documents
+        image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
+        height, width, channels = image.shape
+        # reshape into a flat array of points and calculate the mean color
+        flat_array = image.reshape(height * width, channels)
+        hue, saturation, value = flat_array.mean(axis=0)
+        return hue > 30 or value < 190 or saturation > 160
+
+    def __extract_text_by_ocr(self, image: np.ndarray, lang: str, page_num: int) -> List[TextWithBBox]:
+        """
+        Recognize text on the given page image of the PDF using OCR tools.
+        :param image: image of a PDF page
+        :param lang: language of the text
+        :return: extracted text words with bboxes
+        """
+        ocr_text_bboxes = []
+
+        output_dict = get_text_with_bbox_from_document_page(image, language=lang, ocr_conf_thr=self.config.get("ocr_conf_threshold", -1))
+        for line_num, line in enumerate(output_dict.lines):
+            ocr_text_bboxes.append(TextWithBBox(bbox=line.bbox, text=line.text, page_num=page_num, line_num=line_num))
+
+        return ocr_text_bboxes
+
+    def __mean_similarities_ocr_and_text(self, text_layer_bboxes: List[TextWithBBox], ocr_bboxes: List[TextWithBBox]) -> float:
+        """
+        :param text_layer_bboxes: text from the text layer of the PDF
+        :param ocr_bboxes: text recognized by OCR
+        :return: average similarity between the text-layer text and the OCR text
+        """
+        text_layer = "".join([pdf_line.text for pdf_line in text_layer_bboxes])
+        text_ocr = "".join([ocr_word.text for ocr_word in ocr_bboxes])
+        similarity = similarity_levenshtein(text_layer, text_ocr)
+
+        if self.config.get("debug_mode", False):
+            self.logger.debug(f"AVG SIMILARITY = {similarity}")
+        return similarity
+
+    def __extract_text_layer_from_pdf(self, path: str, page_number: int = 0, is_one_column_document: bool = False) -> List[TextWithBBox]:
+        """
+        Extract the text layer from the given page of the PDF file.
+        :param path: path to the PDF
+        :return: extracted text lines with bboxes
+        """
+        page = ExtractorPdfTextLayer(config=self.config).extract_text_layer(path=path, page_number=page_number,
+                                                                            is_one_column_document=is_one_column_document)
+        return page.bboxes
+
+    def _get_page_num_and_have_text_flag_from_text_layer(self, path: str, is_one_column_list: List[bool], page_count: int) -> namedtuple:
+        have_text = True
+        max_text_layer_bboxes: List[TextWithBBox] = []
+        try:
+            page_count = self.check_page_num if page_count >= self.check_page_num else page_count
+            page_with_max_count_symbol = 0
+            symbol_count = 0
+            max_symbol_count = 0
+            for page_num in range(page_count):
+                try:
+                    text_layer_bboxes = \
+                        self.__extract_text_layer_from_pdf(path=path, page_number=page_num, is_one_column_document=is_one_column_list[page_num])
+
+                    for pdf_line in text_layer_bboxes:
+                        symbol_count += len(pdf_line.text)
+                    if max_symbol_count < symbol_count:
+                        max_symbol_count = symbol_count
+                        page_with_max_count_symbol = page_num
+                        max_text_layer_bboxes = text_layer_bboxes
+                    symbol_count = 0
+                except Exception as exception:
+                    self.logger.warning(f"Can't get text from {path}, get error {exception}. Seems that text layer is broken")
+                    if self.config.get("debug_mode", False):
+                        raise exception
+            if max_symbol_count == 0:
+                have_text = False
+            return self.pdf_page_text_layer_param(page_with_max_count_symbol, have_text, max_text_layer_bboxes)
+        except PDFPageCountError:
+            return self.pdf_page_text_layer_param(0, have_text, max_text_layer_bboxes)
+
+    def _get_image_from_first_page(self, path: str, page_count: int) -> tuple:
+        if page_count is None:
+            page_count = 0
+        page_number = 1 if page_count > 1 else 0
+        image = convert_from_path(path, first_page=page_number + 1, last_page=page_number + 1)[0]
+        image = np.array(image)
+        return image, page_number, page_count
+
+    def _is_txt_layer_correct(self, path: str, lang: str, page_number: int, text_layer_bboxes: List[TextWithBBox], threshold_similarity: float) -> bool:
+        image = convert_from_path(path, first_page=page_number + 1, last_page=page_number + 1)[0]
+        image = np.array(image)
+        ocr_bboxes = self.__extract_text_by_ocr(image=image, lang=lang, page_num=page_number)
+        mean_similarity = self.__mean_similarities_ocr_and_text(text_layer_bboxes, ocr_bboxes)
+        return mean_similarity > threshold_similarity
+
+    def _is_first_page_correct(self, path: str, is_one_column: bool, is_txt_layer_correct: bool) -> bool:
+        if not is_txt_layer_correct:
+            return False
+        bboxes_first_page = [line for line in self.__extract_text_layer_from_pdf(path=path, page_number=0, is_one_column_document=is_one_column)
+                             if len(line.text.strip()) > 0]
+        return len(bboxes_first_page) > 0
+
+    def _detect_text_layer(self, path: str, pdf_page_text_layer_param: namedtuple, is_one_column_list: List[bool], is_booklet: bool,
+                           lang: str, threshold_similarity: float) -> PdfTxtlayerParameters:
+        if self.catboost_model_extractor.detect_text_layer_correctness(text_layer_bboxes=pdf_page_text_layer_param.text_layer_bboxes):
+            self.logger.debug("the CatBoost model assumes the text layer is correct, checking it against OCR results")
+            is_txt_layer_correct = self._is_txt_layer_correct(path=path,
+                                                              lang=lang,
+                                                              page_number=pdf_page_text_layer_param.page_num_with_max_text_size,
+                                                              text_layer_bboxes=pdf_page_text_layer_param.text_layer_bboxes,
+                                                              threshold_similarity=threshold_similarity)
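# A standalone sanity check of the __is_booklet HSV heuristic above (illustrative):
# a plain white page maps to hue 0, saturation 0, value 255 after RGB -> HSV,
# so it is not classified as a booklet.
import cv2
import numpy as np
page = np.full((100, 100, 3), 255, dtype=np.uint8)  # synthetic white RGB page
hue, saturation, value = cv2.cvtColor(page, cv2.COLOR_RGB2HSV).reshape(-1, 3).mean(axis=0)
assert not (hue > 30 or value < 190 or saturation > 160)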
+            if is_txt_layer_correct:
+                self.logger.debug("assume document has a correct text layer")
+            else:
+                self.logger.debug("assume document has an incorrect text layer")
+        else:
+            self.logger.debug("the CatBoost model considers the text layer incorrect")
+            is_txt_layer_correct = False
+        is_first_page_correct = self._is_first_page_correct(path=path,
+                                                            is_one_column=is_one_column_list[0],
+                                                            is_txt_layer_correct=is_txt_layer_correct)
+        return PdfTxtlayerParameters(is_txt_layer_correct, is_first_page_correct, is_booklet)
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_txtlayer_parameters.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_txtlayer_parameters.py
new file mode 100644
index 00000000..ccf2cd09
--- /dev/null
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_txtlayer_parameters.py
@@ -0,0 +1,7 @@
+class PdfTxtlayerParameters:
+
+    def __init__(self, correct_text_layout: bool, correct_first_page: bool, is_booklet: bool) -> None:
+        super().__init__()
+        self.correct_text_layout = correct_text_layout
+        self.correct_first_page = correct_first_page
+        self.is_booklet = is_booklet
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py
index fe35ac03..df5ee592 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py
@@ -34,7 +34,7 @@ def __get_model(self) -> XGBClassifier:
             download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.pkl.gz")

         assert os.path.isfile(self.path)
-        with gzip.open(self.path, 'rb') as f:
+        with gzip.open(self.path, "rb") as f:
             self.__model = pickle.load(f)

         return self.__model
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
index 34872ea1..bbdf6f3e 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -4,9 +4,9 @@
 from typing import List

 from dedoc.data_structures import LineWithMeta
-from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
-from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
 from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
 from dedoc.utils.pdf_utils import get_pdf_page_count

 PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"])
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_feature_extractor.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_feature_extractor.py
index 33164c01..8ee85507 100644
--- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_feature_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_feature_extractor.py
@@ -8,8 +8,8 @@
 class TxtlayerFeatureExtractor:

     def __init__(self) -> None:
-        eng = "".join(list(map(chr, range(ord('a'), ord('z') + 1))))
-        rus = "".join([chr(i) for i in range(ord('а'), ord('а') + 32)] + ["ё"])
+        eng = "".join(list(map(chr, range(ord("a"), ord("z") + 1))))
+        rus = "".join([chr(i) for i in range(ord("а"), ord("а") + 32)] + 
["ё"]) self.lower_letters = eng + rus self.upper_letters = self.lower_letters.upper() diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index e63222af..43dea82d 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -3,7 +3,7 @@ import os from abc import abstractmethod from collections import namedtuple -from typing import List, Optional, Tuple, Iterator +from typing import Iterator, List, Optional, Tuple import cv2 import numpy as np @@ -13,12 +13,12 @@ import dedoc.utils.parameter_utils as param_utils from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.extensions import recognized_mimes, recognized_extensions +from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment @@ -44,7 +44,7 @@ "first_page", "last_page", "need_binarization", - 'table_type', + "table_type", "is_one_column_document_list"]) @@ -166,10 +166,10 @@ def _get_images(self, path: str, page_from: int, page_to: int) -> Iterator[np.nd elif mime in recognized_mimes.image_like_format or path.endswith(tuple(recognized_extensions.image_like_format)): image = cv2.imread(path) if image is None: - raise BadFileFormatException("seems file {} not an image".format(os.path.basename(path))) + raise BadFileFormatError(f"seems file {os.path.basename(path)} not an image") yield image else: - raise BadFileFormatException("Unsupported input format: {}".format(splitext_(path)[1])) # noqa + raise BadFileFormatError(f"Unsupported input format: {splitext_(path)[1]}") def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[np.ndarray]: if page_from >= page_to: @@ -186,14 +186,14 @@ def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ # for convert_from_path function first_page should start from 1, last_page is included to the result images = convert_from_path(path, first_page=left, last_page=right) # noqa # in logging we include both ends of the pages interval, numeration starts with 1 - self.logger.info("Get page from {} to {} of {} file {}".format(left, min(right, page_count), page_count, os.path.basename(path))) + self.logger.info(f"Get page from {left} to {min(right, page_count)} of {page_count} file {os.path.basename(path)}") for image in images: left += 1 if left > page_to + 1: break yield np.array(image) except (PDFPageCountError, PDFSyntaxError) as error: - raise BadFileFormatException(f"Bad pdf file:\n file_name = {os.path.basename(path)} \n exception = {error.args}") + raise BadFileFormatError(f"Bad pdf file:\n file_name = {os.path.basename(path)} \n exception = {error.args}") def _convert_to_gray(self, image: np.ndarray) -> np.ndarray: gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY) @@ -214,8 +214,6 @@ def eval_tables_by_batch(self, orient_cell_angle: int = 270, 
table_type: str = "") -> Tuple[List[np.ndarray], List[ScanTable]]: - result_batch = Parallel(n_jobs=self.config["n_jobs"])( - delayed(self.table_recognizer.recognize_tables_from_image)( - image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) - for i, image in enumerate(batch)) # noqa + result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)( + image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch)) return result_batch diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/adaptive_binarizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/adaptive_binarizer.py index 8c6d2113..820c517d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/adaptive_binarizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/adaptive_binarizer.py @@ -1,4 +1,5 @@ from typing import Tuple + import cv2 import numpy as np diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py index df5e4ca2..8b5e028f 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py @@ -1,19 +1,20 @@ import logging import warnings -from typing import Tuple, Optional +from os import path +from typing import Optional, Tuple + +import cv2 import numpy as np import torch from PIL import Image from torchvision import transforms from torchvision.transforms.functional import resize -import cv2 -from os import path +from dedoc.config import get_config from dedoc.download_models import download_from_hub from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.model import ClassificationModelTorch from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import \ __detect_horizontal_and_vertical_lines as detect_horizontal_and_vertical_lines -from dedoc.config import get_config class ColumnsOrientationClassifier(object): @@ -62,7 +63,7 @@ def _set_device(self, on_gpu: bool) -> None: self.location = lambda storage, loc: storage.cuda() else: self.device = torch.device("cpu") - self.location = 'cpu' + self.location = "cpu" def _load_weights(self, net: ClassificationModelTorch) -> None: path_checkpoint = path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth") @@ -75,11 +76,11 @@ def _load_weights(self, net: ClassificationModelTorch) -> None: with warnings.catch_warnings(): warnings.simplefilter("ignore") net.load_state_dict(torch.load(path_checkpoint, map_location=self.location)) - self.logger.info('Weights were loaded from {}'.format(path_checkpoint)) + self.logger.info(f"Weights were loaded from {path_checkpoint}") def save_weights(self, path_checkpoint: str) -> None: torch.save(self.net.state_dict(), path_checkpoint) - self.logger.info('Weights were saved into {}'.format(path_checkpoint)) + self.logger.info(f"Weights were saved into {path_checkpoint}") def _set_transform_image(self) -> None: """ @@ -112,7 +113,7 @@ def get_features(self, image: np.array) -> torch.Tensor: Get features for image with horizontal and vertical lines """ image = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB) - pil_image = Image.fromarray(np.uint8(image)).convert('RGB') + pil_image = 
Image.fromarray(np.uint8(image)).convert("RGB") tensor_image = self.transform(pil_image).unsqueeze(0).float().to(self.device) return tensor_image diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py index 895d4041..80442aee 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py @@ -1,10 +1,12 @@ +import os from typing import Callable, Dict -from torch.utils.data import Dataset, DataLoader -from torchvision import transforms -import torch + import pandas as pd -import os +import torch from skimage import io +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.transforms import TransformWithLabels @@ -18,8 +20,7 @@ def __init__(self, csv_file: str, root_dir: str, transform: Callable = None) -> Args: csv_file (string): Path to the csv file with annotations. root_dir (string): Directory with all the images. - transform (callable, optional): Optional transform to be applied - on a sample. + transform (callable, optional): Optional transform to be applied on a sample. """ self.label_loader = pd.read_csv(csv_file) self.root_dir = root_dir @@ -36,11 +37,11 @@ def __getitem__(self, idx: torch.Tensor) -> Dict[str, str]: self.label_loader.iloc[idx, 0]) image = io.imread(img_name) label = self.label_loader.iloc[idx, 1:] - orientation = label['orientation'] - orientation = orientation.astype('int') - columns = label['columns'] - columns = columns.astype('int') - sample = {'image': image, 'orientation': orientation, 'columns': columns, 'image_name': img_name} + orientation = label["orientation"] + orientation = orientation.astype("int") + columns = label["columns"] + columns = columns.astype("int") + sample = {"image": image, "orientation": orientation, "columns": columns, "image_name": img_name} if self.transform: sample = self.transform(sample) @@ -54,7 +55,7 @@ class DataLoaderImageOrient(Dataset): """ def __init__(self) -> None: self.transform = transforms.Compose([TransformWithLabels()]) - self.classes = ('1', '2', '0', '90', '180', '270') + self.classes = ("1", "2", "0", "90", "180", "270") def load_dataset(self, csv_path: str, image_path: str, batch_size: int = 4) -> DataLoader: trainset = DatasetImageOrient(csv_file=csv_path, root_dir=image_path, transform=self.transform) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/model.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/model.py index afd4861a..90681cdc 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/model.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/model.py @@ -1,4 +1,5 @@ from typing import Optional + import torch from torch import nn from torchvision import models diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py index ba802074..14b88b30 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/transforms.py @@ -1,7 +1,8 @@ -from typing 
import Dict, Any +from typing import Any, Dict + import numpy as np -from torchvision import transforms from PIL import Image +from torchvision import transforms from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import \ ColumnsOrientationClassifier @@ -17,12 +18,11 @@ def __init__(self) -> None: transforms.Lambda(ColumnsOrientationClassifier.my_resize), transforms.CenterCrop(1200), transforms.ToTensor(), - transforms.Normalize( - mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) ]) def __call__(self, image: np.ndarray) -> Image: - pil_image = Image.fromarray(np.uint8(image)).convert('RGB') + pil_image = Image.fromarray(np.uint8(image)).convert("RGB") image = self.transform(pil_image) return image @@ -37,13 +37,12 @@ def __init__(self) -> None: transforms.Lambda(ColumnsOrientationClassifier.my_resize), transforms.CenterCrop(1200), transforms.ToTensor(), - transforms.Normalize( - mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) ]) def __call__(self, sample: dict) -> Dict[str, Any]: - image, orientation, columns = sample['image'], sample['orientation'], sample['columns'] - pil_image = Image.fromarray(np.uint8(image)).convert('RGB') + image, orientation, columns = sample["image"], sample["orientation"], sample["columns"] + pil_image = Image.fromarray(np.uint8(image)).convert("RGB") image = self.transform(pil_image) - return {'image': image, 'orientation': orientation, 'columns': columns, 'image_name': sample['image_name']} + return {"image": image, "orientation": orientation, "columns": columns, "image_name": sample["image_name"]} diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/metadata_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/metadata_extractor.py index 504e25aa..a6118c27 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/metadata_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/metadata_extractor.py @@ -29,7 +29,7 @@ def predict_annotations(self, page_with_lines: PageWithBBox) -> PageWithBBox: for bbox in page_with_fonts.bboxes: font_size = self.__get_font_size(bbox, image_height) - bbox.annotations.append(SizeAnnotation(start=0, end=len(bbox.text), value="{}".format(font_size))) + bbox.annotations.append(SizeAnnotation(start=0, end=len(bbox.text), value=str(font_size))) return page_with_fonts @@ -47,7 +47,7 @@ def extract_metadata_and_set_annotations(self, page_with_lines: PageWithBBox, ca self.predict_annotations(page_with_lines) lines = [] - for bbox_id, bbox in enumerate(page_with_lines.bboxes): + for bbox in page_with_lines.bboxes: lines.append(self.get_line_with_meta(bbox=bbox)) if page_with_lines.image.ndim == 3 and page_with_lines.image.shape[2] == 3: color_annotation = self.__get_color_annotation(bbox, page_with_lines.image) @@ -69,11 +69,11 @@ def get_line_with_meta(self, bbox: TextWithBBox) -> LineWithLocation: def convert_pixels_into_indentation(indentation_width: int, image_width: int) -> int: # ref from http://officeopenxml.com/WPindentation.php # Values are in twentieths of a point: 1440 twips = 1 inch; 567 twips = 1 centimeter. 
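# Worked example of the conversion below: with image_width = 2970 px, pixel2mm = 297 / 2970 = 0.1 mm per pixel;
# a 100 px indent then gives indentation_mm = 10, i.e. int(10 / 10 * 567) = 567 twips, exactly 1 cm.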
- INDENTATION_PER_CM = 567 + indentation_per_cm = 567 pixel2mm = 297 / image_width # 297 mm it is height of A4 paper indentation_mm = indentation_width * pixel2mm - indentation = int(indentation_mm / 10 * INDENTATION_PER_CM) + indentation = int(indentation_mm / 10 * indentation_per_cm) return indentation @@ -100,20 +100,20 @@ def __get_left_bound_of_text(self, page: PageWithBBox) -> Optional[int]: def __set_indentations(self, page: PageWithBBox) -> PageWithBBox: image_height, image_width, *_ = page.image.shape - SPACES_FOR_TAB = " " + spaces_for_tab = " " # TODO turn off for multicolumn pages (when we write columns-classifier). While turn on for all layout type. left_bound = self.__get_left_bound_of_text(page) if not left_bound: return page - for num, text_with_bbox in enumerate(page.bboxes): + for text_with_bbox in page.bboxes: indentation_text = re.findall("^[ \t]+", text_with_bbox.text) width_space_indentation = 0 width_per_char = text_with_bbox.bbox.width / len(text_with_bbox.text) if indentation_text: - indentation_text = indentation_text[0].replace('\t', SPACES_FOR_TAB) + indentation_text = indentation_text[0].replace("\t", spaces_for_tab) width_space_indentation = len(indentation_text) * width_per_char indentation_width = (text_with_bbox.bbox.x_top_left - left_bound) + width_space_indentation @@ -122,9 +122,7 @@ def __set_indentations(self, page: PageWithBBox) -> PageWithBBox: continue indentation = self.convert_pixels_into_indentation(indentation_width, image_width) - text_with_bbox.annotations.append(IndentationAnnotation(start=0, - end=len(text_with_bbox.text), - value=str(indentation))) + text_with_bbox.annotations.append(IndentationAnnotation(start=0, end=len(text_with_bbox.text), value=str(indentation))) return page @@ -154,9 +152,9 @@ def __add_spacing_annotations(self, lines: List[LineWithLocation]) -> None: median_bbox_size = median([line.location.bbox.height for line in lines]) prev_line = None for line in lines: - if (prev_line is None or - prev_line.location.page_number != line.location.page_number or - prev_line.location.bbox.y_bottom_right >= line.location.bbox.y_top_left): + if prev_line is None or \ + prev_line.location.page_number != line.location.page_number or \ + prev_line.location.bbox.y_bottom_right >= line.location.bbox.y_top_left: space = self.default_spacing else: space = (line.location.bbox.y_top_left - prev_line.location.bbox.y_bottom_right) @@ -172,9 +170,7 @@ def __get_color_annotation(self, bbox_with_text: TextWithBBox, image: np.ndarray image_slice = image[bbox.y_top_left: bbox.y_bottom_right, bbox.x_top_left: bbox.x_bottom_right, :] threshold = 245 - not_white = ((image_slice[:, :, 0] < threshold) & - (image_slice[:, :, 1] < threshold) & - (image_slice[:, :, 2] < threshold)) + not_white = (image_slice[:, :, 0] < threshold) & (image_slice[:, :, 1] < threshold) & (image_slice[:, :, 2] < threshold) if not_white.sum() > 0: red, green, blue = [image_slice[not_white, i].mean() for i in range(3)] else: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py index 46027b39..d6133205 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_cell_extractor.py @@ -1,6 +1,7 @@ import logging import os -from typing import List, Tuple, Iterator, Optional +from typing import Iterator, List, Optional, Tuple + import numpy as np from PIL import Image @@ -29,10 +30,10 @@ def get_cells_text(self, 
img_cells: List[np.ndarray], language: str) -> List[str if self.config.get("debug_mode", False): tmp_dir = "/tmp/docreader/debug_tables/batches/" os.makedirs(tmp_dir, exist_ok=True) - tmp_dir = os.path.join(tmp_dir, "{}".format(len(os.listdir(tmp_dir)))) + tmp_dir = os.path.join(tmp_dir, f"{len(os.listdir(tmp_dir))}") os.makedirs(tmp_dir, exist_ok=True) for i, image in enumerate(batch): - image.save(os.path.join(tmp_dir, "image_{}.png".format(i))) + image.save(os.path.join(tmp_dir, f"image_{i}.png")) res.extend(self.__handle_one_batch(batch, language)) assert len(res) == len(img_cells) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py index 6ee53e43..3d10cf89 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py @@ -1,14 +1,14 @@ import concurrent.futures from collections import namedtuple -from typing import List, Iterator, Iterable +from typing import Iterable, Iterator, List + import numpy as np from dedoc.data_structures.bbox import BBox from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox -from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import \ - get_text_with_bbox_from_document_page_one_column, \ - get_text_with_bbox_from_cells, get_text_with_bbox_from_document_page +from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells, get_text_with_bbox_from_document_page, \ + get_text_with_bbox_from_document_page_one_column BBoxLevel = namedtuple("BBoxLevel", ["text_line", "some_word"]) bbox_level = BBoxLevel(4, 5) @@ -25,8 +25,7 @@ def split_image2lines(self, language: str = "rus+eng", is_one_column_document: bool = True, cells: bool = False) -> PageWithBBox: - bboxes = self.__split_image2bboxes(image=image, page_num=page_num, language=language, - is_one_column_document=is_one_column_document, cells=cells) + bboxes = self.__split_image2bboxes(image=image, page_num=page_num, language=language, is_one_column_document=is_one_column_document, cells=cells) filtered_bboxes = list(self._filtered_bboxes(bboxes)) if len(filtered_bboxes) >= 0: @@ -52,17 +51,11 @@ def _is_box_in(box1: BBox, box2: BBox) -> bool: """ check if box1 is in box2 """ - return ((box1.x_top_left >= box2.x_top_left) and - (box1.y_top_left >= box2.y_top_left) and - (box1.x_bottom_right <= box2.x_bottom_right) and - (box1.y_bottom_right <= box2.y_bottom_right)) + x_inside = (box1.x_top_left >= box2.x_top_left) and (box1.x_bottom_right <= box2.x_bottom_right) + y_inside = (box1.y_top_left >= box2.y_top_left) and (box1.y_bottom_right <= box2.y_bottom_right) + return x_inside and y_inside - def __split_image2bboxes(self, - image: np.ndarray, - page_num: int, - language: str, - is_one_column_document: bool, - cells: bool = False) -> List[TextWithBBox]: + def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str, is_one_column_document: bool, cells: bool = False) -> List[TextWithBBox]: ocr_conf_thr = self.config.get("ocr_conf_threshold", -1) if not cells: if is_one_column_document: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_block.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_block.py index 7185e21a..9a61a25f 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_block.py +++ 
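The BBoxLevel(4, 5) constants above refer to tesseract's result hierarchy, where level 4 is a text line and level 5 is a word (1 is page, 2 block, 3 paragraph). A minimal sketch with pytesseract; the image path is hypothetical:

import pytesseract
from pytesseract import Output

data = pytesseract.image_to_data("page.png", output_type=Output.DICT)
words = [text for text, level in zip(data["text"], data["level"]) if level == 5 and text.strip()]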
b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_block.py
@@ -29,6 +29,6 @@ def from_list(elements: List[OcrElement], ocr_conf_thr: float) -> "OcrBlock":
         elif element.level == OcrBlock.level:
             head = element
         else:
-            raise ValueError("Some element {} has level greater than this {}".format(element, OcrBlock.level))
+            raise ValueError(f"Some element {element} has level greater than {OcrBlock.level}")
     paragraphs = [OcrParagraph.from_list(paragraph2elements[key], ocr_conf_thr) for key in sorted(paragraph2elements.keys())]
     return OcrBlock(paragraphs=paragraphs, order=head.block_num, bbox=head.bbox)
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
index b98f74e3..64181a50 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
@@ -39,7 +39,7 @@ def from_list(line: List[OcrElement], ocr_conf_thr: float) -> "OcrLine":
         words = []
         head = None
         for element in line:
-            assert element.level >= OcrLine.level, "get {} in line".format(element)
+            assert element.level >= OcrLine.level, f"got {element} in line"
             if element.level == OcrLine.level:
                 head = element
             else:
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_page.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_page.py
index 42dde517..552ad596 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_page.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_page.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import List, Dict, Iterable
+from typing import Dict, Iterable, List
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_block import OcrBlock
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_line import OcrLine
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_tuple.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_tuple.py
index f7c0bab4..7211203e 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_tuple.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_tuple.py
@@ -35,7 +35,7 @@ def __init__(self,
         self.block_num = block_num
     def __str__(self) -> str:
-        return "OcrTUPLE(level={}, conf={}, text={})".format(self.level, self.conf, self.text[:60])
+        return f"OcrTUPLE(level={self.level}, conf={self.conf}, text={self.text[:60]})"
     def __repr__(self) -> str:
         return str(self)
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_utils.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_utils.py
index 6a56e276..bdad4148 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_utils.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_utils.py
@@ -15,7 +15,7 @@ def get_cell_text_by_ocr(img_cell: np.ndarray, language: str) -> str:
 def get_text_from_table_cell(image: np.ndarray, language: str) -> str:
     config = "--psm 6"
-    text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)['text']
+    text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)["text"]
     return text
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/paragraph_features.py b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/paragraph_features.py
index 427efb82..2d5fca1b 100644
---
a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/paragraph_features.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/paragraph_features.py @@ -1,12 +1,13 @@ import json from collections import deque -from typing import List, Optional, Any +from typing import Any, List, Optional + import numpy as np import pandas as pd -from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor from dedoc.utils.utils import list_get @@ -14,7 +15,7 @@ class ParagraphFeatureExtractor(AbstractFeatureExtractor): - def __init__(self, *, config: dict = None, **kwargs: Any) -> None: + def __init__(self, *, config: dict = None, **kwargs: Any) -> None: # noqa super().__init__() self.config = config if config is not None else {} @@ -81,10 +82,7 @@ def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] df_results[col] /= (df_results[col].max() - df_results[col].min() + 1) return df_results[sorted(df_results.columns)] - def _relative_indent(self, - this_bbox: Optional[BBox], - prev_bbox: Optional[BBox], - left: bool = True) -> Optional[float]: + def _relative_indent(self, this_bbox: Optional[BBox], prev_bbox: Optional[BBox], left: bool = True) -> Optional[float]: if this_bbox is None or prev_bbox is None: return None elif left: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py index 87d46928..0dba5e81 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py @@ -2,6 +2,7 @@ import os import pickle from typing import List + from xgboost import XGBClassifier from dedoc.config import get_config diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index fa12f896..bffe9940 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -1,17 +1,18 @@ import logging import os from datetime import datetime -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple + import cv2 import numpy as np from dedoc.config import get_config -from dedoc.extensions import recognized_mimes, recognized_extensions -from dedoc.readers.pdf_reader.pdf_image_reader.adaptive_binarizer import AdaptiveBinarizer +from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.pdf_base_reader import PdfBaseReader, ParametersForParseDoc +from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader 
+from dedoc.readers.pdf_reader.pdf_image_reader.adaptive_binarizer import AdaptiveBinarizer from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor from dedoc.readers.pdf_reader.pdf_image_reader.scan_rotator import ScanRotator @@ -81,21 +82,19 @@ def _process_one_page(self, # --- Step 3: table detection and recognition --- if parameters.need_pdf_table_analysis: - clean_image, tables = self.table_recognizer. \ - recognize_tables_from_image(image=rotated_image, - page_number=page_number, - language=parameters.language, - orient_analysis_cells=parameters.orient_analysis_cells, - orient_cell_angle=parameters.orient_cell_angle, - table_type=parameters.table_type) + clean_image, tables = self.table_recognizer.recognize_tables_from_image( + image=rotated_image, + page_number=page_number, + language=parameters.language, + orient_analysis_cells=parameters.orient_analysis_cells, + orient_cell_angle=parameters.orient_cell_angle, + table_type=parameters.table_type + ) else: clean_image, tables = rotated_image, [] # --- Step 4: plain text recognition and text style detection --- - page = self.ocr.split_image2lines(image=clean_image, - language=parameters.language, - is_one_column_document=is_one_column_document, - page_num=page_number) + page = self.ocr.split_image2lines(image=clean_image, language=parameters.language, is_one_column_document=is_one_column_document, page_num=page_number) lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page) if self.config.get("labeling_mode"): @@ -118,7 +117,7 @@ def _detect_columncount_and_orientation(self, image: np.ndarray, parameters: Par self.logger.info("Call orientation and columns classifier") columns, angle = self.column_orientation_classifier.predict(image) - self.logger.debug("Predict {}".format(angle)) + self.logger.debug(f"Predict {angle}") if columns is not None: self.logger.info(f"Final number of columns: {columns}") else: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py b/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py index fd15b97d..089cad7a 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/scan_rotator.py @@ -1,5 +1,6 @@ import logging -from typing import List, Iterator +from typing import Iterator, List + import cv2 import numpy as np from joblib import Parallel, delayed @@ -45,7 +46,7 @@ def auto_rotate(self, image: np.ndarray, orientation_angle: int = 0) -> (np.ndar rotated = rotate_image(image, best_angle) if self.config.get("debug_mode"): - self.logger.debug(f'Best angle: {best_angle}, orientation angle: {orientation_angle}') + self.logger.debug(f"Best angle: {best_angle}, orientation angle: {orientation_angle}") return rotated, best_angle + orientation_angle def rotate(self, images: List[np.ndarray]) -> Iterator[np.ndarray]: @@ -55,5 +56,5 @@ def rotate(self, images: List[np.ndarray]) -> Iterator[np.ndarray]: n_jobs = self.config["n_jobs"] for batch in get_batch(size=n_jobs, iterable=images): rotated_ = Parallel(n_jobs=n_jobs)(delayed(self.auto_rotate)(img) for img in batch) - for res, angle in rotated_: + for res, _ in rotated_: yield res diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py index 2821e046..a6593496 
100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py @@ -1,4 +1,5 @@ -from typing import List, Dict, Tuple, Optional +from typing import Dict, List, Optional, Tuple + import numpy as np from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell @@ -49,10 +50,7 @@ def split(self, cells: List[List[Cell]]) -> List[List[Cell]]: # fill the result matrix for row in cells_fixed_boarder: for cell in row: - self.__split_one_cell(cell=cell, - horizontal_borders=horizontal_borders, - result_matrix=result_matrix, - vertical_borders=vertical_borders) + self.__split_one_cell(cell=cell, horizontal_borders=horizontal_borders, result_matrix=result_matrix, vertical_borders=vertical_borders) for row_id, row in enumerate(result_matrix): for col_id, cell in enumerate(row): @@ -116,19 +114,15 @@ def _merge_close_borders(self, cells: List[List[Cell]]) -> List[List[Cell]]: horizontal_dict = self.__get_border_dict(borders=horizontal_borders, threshold=eps_horizontal) vertical_dict = self.__get_border_dict(borders=vertical_borders, threshold=eps_vertical) result = [] - for row_id, row in enumerate(cells): + for row in cells: new_row = [] - for cell_id, cell in enumerate(row): + for cell in row: x_top_left = vertical_dict[cell.x_top_left] x_bottom_right = vertical_dict[cell.x_bottom_right] y_top_left = horizontal_dict[cell.y_top_left] y_bottom_right = horizontal_dict[cell.y_bottom_right] if y_top_left < y_bottom_right and x_top_left < x_bottom_right: - new_cell = Cell.copy_from(cell, - x_top_left=x_top_left, - x_bottom_right=x_bottom_right, - y_top_left=y_top_left, - y_bottom_right=y_bottom_right) + new_cell = Cell.copy_from(cell, x_top_left=x_top_left, x_bottom_right=x_bottom_right, y_top_left=y_top_left, y_bottom_right=y_bottom_right) new_row.append(new_cell) result.append(new_row) return result diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index e3431a4a..59f802b8 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -1,9 +1,10 @@ import copy from typing import List + import numpy as np -from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell +from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import get_cell_text_by_ocr @@ -54,9 +55,7 @@ def split_last_column(matrix_table: List[List[Cell]], language: str, image: np.a if row_id == len(last_column) - 1 and len(union_cells) > 1 or \ cell.id_con != prev_cell.id_con and len(union_cells) > 1: result_matrix[start_union_cell:start_union_cell + len(union_cells)] = \ - _split_each_row(union_cells, matrix_table[start_union_cell:start_union_cell + len(union_cells)], - language=language, - image=image) + _split_each_row(union_cells, matrix_table[start_union_cell:start_union_cell + len(union_cells)], language=language, image=image) union_cells = [cell] start_union_cell = -1 @@ -73,8 +72,7 @@ def split_last_column(matrix_table: List[List[Cell]], language: str, image: np.a return result_matrix -def _split_each_row(union_cells: List[Cell], 
matrix_table: List[List[Cell]], language: str, image: np.array) \
-        -> List[List[Cell]]:
+def _split_each_row(union_cells: List[Cell], matrix_table: List[List[Cell]], language: str, image: np.array) -> List[List[Cell]]:
     assert len(union_cells) == len(matrix_table)
     if len(matrix_table[0]) < 1:
         return matrix_table
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/base_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/base_table_extractor.py
index 38741ee5..9b40f77b 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/base_table_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/base_table_extractor.py
@@ -1,5 +1,6 @@
-from typing import List
 import logging
+from typing import List
+
 from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
@@ -13,13 +14,11 @@ def _print_matrix_table(self, matrix_table: List[List[Cell]]) -> None:
         for i in range(len(matrix_table)):
             string += " ".join([str(cell.id_con) for cell in matrix_table[i]])
             string += "\n"
-        self.logger.debug("{}\nend table".format(string))
+        self.logger.debug(f"{string}\nend table")
     def _print_table_attr(self, matrix_cells: List[List[Cell]]) -> None:
         string = "Table:\n"
         for i in range(0, len(matrix_cells)):
-            string += "\t".join([("{}/{}/{}".format(cell.id_con,
-                                                    cell.is_attribute,
-                                                    cell.is_attribute_required)) for cell in matrix_cells[i]])
+            string += "\t".join([f"{cell.id_con}/{cell.is_attribute}/{cell.is_attribute_required}" for cell in matrix_cells[i]])
             string += "\n"
         self.logger.debug(string)
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py
index 34b6b6bf..39517b2f 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py
@@ -1,5 +1,5 @@
-from typing import List
 import logging
+from typing import List
 from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
 from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -16,20 +16,17 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None:
         super().__init__(config=config, logger=logger)
         self.single_tables = [[]]  # simple tables on all pages
-    def extract_multipage_tables(self,
-                                 single_tables: List[ScanTable],
-                                 lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
+    def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
         self.single_tables = single_tables
         multipages_tables = []
         list_page_with_tables = []
         total_pages = max((table.page_number + 1 for table in single_tables), default=0)
         for cur_page in range(total_pages):
-            # 1. get possible diaposon of neighbors pages with tables
-            # распределение по страницам
+            # 1. get the possible range of neighboring pages with tables
+            # pages distribution
             list_mp_table = [t for t in self.single_tables if t.page_number == cur_page]
             list_page_with_tables.append(list_mp_table)
-        # iterate over all tables. Main processing loop.
total_cur_page = 0 if total_pages == 1: for tbls in list_page_with_tables: @@ -39,58 +36,21 @@ def extract_multipage_tables(self, while total_cur_page < total_pages: begin_page = total_cur_page - # если нет таблиц на текущей странице + # if tables are not found on the current page if len(list_page_with_tables[begin_page]) == 0: total_cur_page += 1 continue - # последняя таблица на текущей странице может иметь продолжение - # начинаем анализ на слияние таблиц + # table merging analysis t1 = list_page_with_tables[begin_page][-1] - # цикл по следующим страницам - finish = False # условие выхода анализа текущей многостраничной таблицы + # next pages cycle cur_page = begin_page + 1 - - while not finish: - # условия выхода - if cur_page == total_pages: # достигнут конец документа - finish = True - continue - - if len(list_page_with_tables[cur_page]) == 0: # нет таблиц на текущей странице - finish = True - continue - - # рассматриваем первую страницу на текущей странице - t2 = list_page_with_tables[cur_page][0] - if self.config.get("debug_mode", False): - self.logger.debug("cur page: {}".format(cur_page)) - - # проверка что t2 является продолжением t1 - if self.__is_one_table(t1, t2): - # таблица присоединяется к первой - t1.extended(t2) - list_page_with_tables[cur_page].pop(0) - self.__delete_ref_table(lines=lines_with_meta, table_name=t2.name) - else: - if len(list_page_with_tables[cur_page]) > 0: - cur_page -= 1 # чтобы начать с этой страницы анализ, а не со следующей - finish = True - continue - - if not finish: - # если несколько таблиц на странице, то завершаем объединение многостраничной таблицы - if len(list_page_with_tables[cur_page]) > 0: - cur_page -= 1 # чтобы начать с этой страницы анализ, а не со следующей - finish = True - else: # продолжаем обход - cur_page += 1 - - total_cur_page = cur_page + 1 # анализ следующей страницы + cur_page = self.__handle_multipage_table(cur_page, lines_with_meta, list_page_with_tables, t1, total_pages) + total_cur_page = cur_page + 1 multipages_tables.extend(list_page_with_tables[begin_page][:-1]) - multipages_tables.append(t1) # добавление многостраничной таблицы + multipages_tables.append(t1) list_page_with_tables[begin_page] = [] for page in range(begin_page + 1, min(cur_page + 1, total_pages)): if len(list_page_with_tables[page]) > 0: @@ -99,6 +59,48 @@ def extract_multipage_tables(self, return multipages_tables + def __handle_multipage_table(self, + cur_page: int, + lines_with_meta: List[LineWithMeta], + list_page_with_tables: List[List[ScanTable]], + t1: ScanTable, + total_pages: int) -> int: + finish = False # multipage table finished + while not finish: + if cur_page == total_pages: # end of the document + finish = True + continue + + if len(list_page_with_tables[cur_page]) == 0: # tables are not found on the current page + finish = True + continue + + # first table on the current page + t2 = list_page_with_tables[cur_page][0] + if self.config.get("debug_mode", False): + self.logger.debug(f"cur page: {cur_page}") + + # t2 is continuation of t1 + if self.__is_one_table(t1, t2): + # t2 is merged with t1 + t1.extended(t2) + list_page_with_tables[cur_page].pop(0) + self.__delete_ref_table(lines=lines_with_meta, table_name=t2.name) + else: + if len(list_page_with_tables[cur_page]) > 0: + cur_page -= 1 # analysis from the current page, not the next one + finish = True + continue + + if not finish: + # if there are several tables on the current page, end of parsing of the current multipage table + if len(list_page_with_tables[cur_page]) > 0: + 
cur_page -= 1  # analysis from the current page, not the next one
+                finish = True
+            else:
+                cur_page += 1
+        return cur_page
+
     def __delete_ref_table(self, lines: List[LineWithMeta], table_name: str) -> None:
         for line in lines:
             for num, ann in enumerate(line.annotations):
@@ -129,28 +131,28 @@ def __is_equal_width_cells(self, table_part_1: List[List[Cell]], table_part_2: L
         width_cell2 = self.__get_width_cell_wo_separating(table_part_2[0])
         for i in range(0, len(width_cell1)):
-            eps = max(4, int(width_cell1[i] * 0.1))  # +-1% от ширины погрешность
+            eps = max(4, int(width_cell1[i] * 0.1))  # error margin: +-10% of the width
             if len(width_cell2) <= i or (not equal_with_eps(width_cell1[i], width_cell2[i], eps)):
                 if self.config.get("debug_mode", False):
-                    self.logger.debug("1 - {}".format(width_cell1[i]))
-                    self.logger.debug("2 - {}".format(width_cell2[i]))
-                    self.logger.debug("eps = {}".format(eps))
+                    self.logger.debug(f"1 - {width_cell1[i]}")
+                    self.logger.debug(f"2 - {width_cell2[i]}")
+                    self.logger.debug(f"eps = {eps}")
                 return False
         return True
     def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
-        # condition 1. Width1 == Width2. Ширина таблиц должна совпадать
+        # condition 1. Width1 == Width2. Table widths should be equal
         width1 = abs(t1.locations[-1].bbox.width)
         width2 = abs(t2.locations[0].bbox.width)
-        eps_width = int(width1 * 0.03)  # в диапозоне +-1% от ширины погрешность
+        eps_width = int(width1 * 0.03)  # error margin: +-3% of the width
         if not equal_with_eps(width1, width2, eps_width):
             if self.config.get("debug_mode", False):
                 self.logger.debug("Different width tables")
-                self.logger.debug("w1, w2, eps = {}, {}, {}".format(width1, width2, eps_width))
+                self.logger.debug(f"w1, w2, eps = {width1}, {width2}, {eps_width}")
             return False
-        # condition 2. исключение дублированного заголовка (если он есть)
+        # condition 2. Exclusion of the duplicated header (if any)
         attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells)
         attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells)
         if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
@@ -159,16 +161,15 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
         if len(t2.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
             return False
-        # clear the attribute flags of the second part of the table
         TableAttributeExtractor.clear_attributes(t2.matrix_cells)
-        # condition 3. количество столбцов должно совпадать
+        # condition 3. Number of columns should be equal
         if len(t1.matrix_cells[-1]) != len(t2.matrix_cells[0]):
             if self.config.get("debug_mode", False):
                 self.logger.debug("Different count column")
             return False
-        # condition 4. сравнение ширин столбцов последнего и первого рядов
+        # condition 4. Comparison of the widths of the last and first rows
         if not self.__is_equal_width_cells(t1.matrix_cells, t2.matrix_cells):
             if self.config.get("debug_mode", False):
                 self.logger.debug("Different width columns")
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
index 66371185..050f006e 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -2,6 +2,7 @@
 import logging
 import uuid
 from typing import List
+
 import numpy as np
 from dedoc.data_structures.bbox import BBox
@@ -76,7 +77,7 @@ def __detect_diff_orient(self, cell_text: str) -> bool:
         :return: True if cell is vertical and False otherwise
         """
         # 1 - split the text into lines that contain at least one character
-        parts = cell_text.split('\n')
+        parts = cell_text.split("\n")
         parts = [p for p in parts if len(p) > 0]
         # 2 - compute the average line length of the cell
@@ -84,8 +85,8 @@ def __detect_diff_orient(self, cell_text: str) -> bool:
         avg_len_part = np.average(len_parts)
         # Heuristic: the cell is considered rotated if it has many lines and the lines are short
-        if len(parts) > self.config['minimal_cell_cnt_line'] \
-                and avg_len_part < self.config['minimal_cell_avg_length_line']:
+        if len(parts) > self.config["minimal_cell_cnt_line"] \
+                and avg_len_part < self.config["minimal_cell_avg_length_line"]:
             return True
         return False
@@ -107,9 +108,7 @@ def __analyze_header_cell_with_diff_orient(self, tables: List[ScanTable], langua
         for i, row in enumerate(attrs):
             for j, attr in enumerate(row):
                 if self.__detect_diff_orient(attr.text):
-                    rotated_cell, rotated_image = self.__correct_orient_cell(attr,
-                                                                             language=language,
-                                                                             rotated_angle=rotated_angle)
+                    rotated_cell, rotated_image = self.__correct_orient_cell(attr, language=language, rotated_angle=rotated_angle)
                     table.matrix_cells[i][j] = rotated_cell
         return tables
@@ -152,10 +151,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
         bbox = BBox(table_tree.data_bb[0], table_tree.data_bb[1], table_tree.data_bb[2], table_tree.data_bb[3])
-        matrix_table = ScanTable(matrix_cells=matrix,
-                                 bbox=bbox,
-                                 page_number=self.page_number,
-                                 name=str(uuid.uuid1()))
+        matrix_table = ScanTable(matrix_cells=matrix, bbox=bbox, page_number=self.page_number, name=str(uuid.uuid1()))
         return matrix_table
@@ -172,16 +168,13 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type:
                 cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells)
                 # Heuristic 2: the table should have more than one column
-                if len(cur_table.matrix_cells[0]) > 1 or \
-                        (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
+                if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []):
                     tables.append(cur_table)
                     if self.table_options.split_last_column in table_type:
-                        cur_table.matrix_cells = split_last_column(cur_table.matrix_cells,
-                                                                   language=self.language,
-                                                                   image=self.image)
+                        cur_table.matrix_cells = split_last_column(cur_table.matrix_cells, language=self.language, image=self.image)
             except Exception as ex:
-                self.logger.warning("Warning: unrecognized table into page {}. {}".format(self.page_number, ex))
+                self.logger.warning(f"Warning: unrecognized table on page {self.page_number}. {ex}")
                 if self.config.get("debug_mode", False):
                     raise ex
         return tables
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py
index 21b1ee16..c2c37780 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py
@@ -52,7 +52,7 @@ def clear_attributes(matrix_table: List[List[Cell]]) -> None:
     def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, max_raw_of_search: int) -> bool:
         # a "№ п/п" (row numbering) column
         for i in range(0, max_raw_of_search + 1):
-            if column_id < len(matrix_table[i]) and '№' in matrix_table[i][column_id].text and len(
+            if column_id < len(matrix_table[i]) and "№" in matrix_table[i][column_id].text and len(
                     matrix_table[i][column_id].text) < len("№ п/п\n"):
                 return True
         return False
@@ -88,7 +88,7 @@ def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool
     def __analyze_attr_for_vertical_union_columns(self, scan_table: ScanTable) -> List[int]:
         vertical_union_columns = []
         if len(vertical_union_columns) != 0 and len(scan_table.matrix_cells) > 1:
-            self.logger.debug('ATTR_TYPE: vertical union table')
+            self.logger.debug("ATTR_TYPE: vertical union table")
             row_max_attr = 1
             i = 1
@@ -127,16 +127,14 @@ def __analyze_attr_for_horizontal_union_raws(self, scan_table: ScanTable) -> Lis
         # one (1) - with required fields, one (2) - with optional ones
         # therefore len(matrix_table) > first_required_column + 2
         if len(horizontal_union_rows) > 0 and \
-                self.__is_indexable_column(scan_table.matrix_cells, first_required_column,
-                                           max_raw_of_search=horizontal_union_rows[-1]) \
+                self.__is_indexable_column(scan_table.matrix_cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \
                 and len(scan_table.matrix_cells) > first_required_column + 2:
             scan_table.matrix_cells[0][first_required_column + 1].is_attribute_required = True
         # completely empty rows cannot be attributes (they are not informative)
         # move the attributes to the next table row
         index_empty_rows = horizontal_union_rows[-1]
-        if self.__is_empty_row(scan_table.matrix_cells, index_empty_rows) and len(
-                scan_table.matrix_cells) != index_empty_rows + 1:
+        if self.__is_empty_row(scan_table.matrix_cells, index_empty_rows) and len(scan_table.matrix_cells) != index_empty_rows + 1:
             horizontal_union_rows.append(index_empty_rows + 1)
             for j in range(0, len(scan_table.matrix_cells[index_empty_rows + 1])):
                 scan_table.matrix_cells[index_empty_rows + 1][j].is_attribute = True
@@ -144,7 +142,7 @@ def __analyze_attr_for_horizontal_union_raws(self, scan_table: ScanTable) -> Lis
         return horizontal_union_rows
     def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None:
-        self.logger.debug('ATTR_TYPE: simple table')
+        self.logger.debug("ATTR_TYPE: simple table")
         for j in range(0, len(scan_table.matrix_cells[0])):
             scan_table.matrix_cells[0][j].is_attribute = True
         # set first required column
@@ -160,6 +158,5 @@ def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None:
         # one column should be (0) - numbered,
         # one (1) - with required fields, one (2) - with optional ones
         # therefore len(matrix_table) > first_required_column + 2
-        if self.__is_indexable_column(scan_table.matrix_cells, first_required_column, 0) and len(
-                scan_table.matrix_cells) > first_required_column + 2:
+        if self.__is_indexable_column(scan_table.matrix_cells, first_required_column, 0) and len(scan_table.matrix_cells) > first_required_column + 2:
             scan_table.matrix_cells[0][first_required_column + 1].is_attribute_required = True
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
index 7f0c1cfb..1934ba9e 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -1,26 +1,25 @@
 import json
 import logging
+import os
+from typing import List, Optional, Tuple
+
 import cv2
-from PIL import Image
 import numpy as np
-from typing import List, Tuple, Optional
-import os
+from PIL import Image
-from dedoc.train_dataset.data_path_config import table_path as save_path
-from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.bbox import BBox
+from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor
+from dedoc.train_dataset.data_path_config import table_path as save_path
-'''-------------------------------------entry class of Table Recognizer Module---------------------------------------'''
+"""-------------------------------------entry class of Table Recognizer Module---------------------------------------"""
 class TableRecognizer(object):
-    def __init__(self,
-                 *,
-                 config: dict = None) -> None:
+    def __init__(self, *, config: dict = None) -> None:
         self.logger = config.get("logger", logging.getLogger(__name__))
@@ -34,12 +33,9 @@ def __init__(self,
         if not os.path.exists(self.config["path_detect"]):
             os.makedirs(self.config["path_detect"])
-    def convert_to_multipages_tables(self,
-                                     all_single_tables: List[ScanTable],
-                                     lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
+    def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
-        multipage_tables = self.multipage_tables_extractor.extract_multipage_tables(single_tables=all_single_tables,
-                                                                                    lines_with_meta=lines_with_meta)
+        multipage_tables = self.multipage_tables_extractor.extract_multipage_tables(single_tables=all_single_tables, lines_with_meta=lines_with_meta)
         return multipage_tables
     def recognize_tables_from_image(self,
                                     image: np.ndarray,
                                     page_number: int,
                                     language: str,
                                     orient_analysis_cells: bool,
                                     orient_cell_angle: int,
                                     table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]:
-        self.logger.debug("Page %i" % page_number)
+        self.logger.debug(f"Page {page_number}")
         try:
-            cleaned_image, matrix_tables = self.__rec_tables_from_img(image, page_num=page_number,
+            cleaned_image, matrix_tables = self.__rec_tables_from_img(image,
+                                                                      page_num=page_number,
                                                                       language=language,
orient_analysis_cells=orient_analysis_cells, orient_cell_angle=orient_cell_angle, @@ -130,8 +127,7 @@ def __filter_bad_tables(self, tables: List[ScanTable], image: np.ndarray) -> Lis def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: bbox = table.location.bbox height, width = image.shape - table_image = image[max(bbox.y_top_left, 0): min(bbox.y_bottom_right, height), - max(bbox.x_top_left, 0): min(bbox.x_bottom_right, width)] + table_image = image[max(bbox.y_top_left, 0): min(bbox.y_bottom_right, height), max(bbox.x_top_left, 0): min(bbox.x_bottom_right, width)] mean = table_image.mean() std = table_image.std() white_mean = (table_image > 225).mean() @@ -143,11 +139,7 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: cells_area += cell.width * cell.height ratio = cells_area / table_area - res = ((white_mean < 0.5) or - (black_mean > 0.3) or - (std < 30) or (mean < 150) or - (mean < 200 and std < 80) or - ratio < 0.65) + res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or ratio < 0.65 return res def __save_tables(self, tables: List[ScanTable], image: np.ndarray, table_path: Optional[str] = None) -> None: @@ -155,8 +147,8 @@ def __save_tables(self, tables: List[ScanTable], image: np.ndarray, table_path: os.makedirs(table_path, exist_ok=True) for table in tables: images_cnt = len(os.listdir(table_path)) - image_path = os.path.join(table_path, "{:06d}.png".format(images_cnt)) - jsons_path = os.path.join(table_path, "{:06d}.json".format(images_cnt)) + image_path = os.path.join(table_path, f"{images_cnt:06d}.png") + jsons_path = os.path.join(table_path, f"{images_cnt:06d}.json") image.save(image_path) with open(jsons_path, "w") as out: json.dump(obj=table.to_dict(), fp=out, indent=4, ensure_ascii=False) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py index 3b617726..f18b7505 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py @@ -1,23 +1,19 @@ -import cv2 -from typing import List, Tuple -import os import csv import json +import os +from typing import List, Tuple +import cv2 + +from dedoc.config import get_config from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader -from dedoc.config import get_config def _create_cell(c: str, text_cells: list) -> Cell: - cell = Cell( - x_bottom_right=-1, - x_top_left=-1, - y_top_left=-1, - y_bottom_right=-1, - ) - if 'a' in c: + cell = Cell(x_bottom_right=-1, x_top_left=-1, y_top_left=-1, y_bottom_right=-1) + if "a" in c: cell.is_attribute = True # loading cell text if len(text_cells) != 0: @@ -30,16 +26,16 @@ def _create_cell(c: str, text_cells: list) -> Cell: def load_from_csv(path_csv: str, path_class_2_csv: str = "") -> List[List[Cell]]: text_cells = [] if path_class_2_csv != "": - csv_file_class_2 = open(path_class_2_csv, 'r', newline='') + csv_file_class_2 = open(path_class_2_csv, "r", newline="") reader_class_2 = csv.reader(csv_file_class_2) text_cells = [r for r in reader_class_2] matrix = [] - with open(path_csv, 'r', newline='') as csv_file: + with open(path_csv, "r", newline="") 
as csv_file:
         reader = csv.reader(csv_file)
         for raw in reader:
-            if len(raw) >= 5 and raw[0] == 'bbox':
+            if len(raw) >= 5 and raw[0] == "bbox":
                 pass
             else:
                 line = [_create_cell(c, text_cells) for c in raw if c != ""]
@@ -85,24 +81,18 @@ def draw_recognized_cell(tables: List[ScanTable], path_image: str, path_save: st
             cv2.rectangle(img, (bbox.x_top_left, bbox.y_top_left), (bbox.width, bbox.height), blue_color, 6)
         for i in range(0, len(table)):
             for j in range(0, len(table[i])):
-                cv2.rectangle(img, (table[i][j].x_top_left, table[i][j].y_top_left), (table[i][j].x_bottom_right, table[i][j].y_bottom_right), red_color,
-                              4)
-                cv2.putText(img, str(table[i][j].id_con), (table[i][j].x_top_left, table[i][j].y_bottom_right), cv2.FONT_HERSHEY_PLAIN, 4,
-                            green_color)
+                cv2.rectangle(img, (table[i][j].x_top_left, table[i][j].y_top_left), (table[i][j].x_bottom_right, table[i][j].y_bottom_right), red_color, 4)
+                cv2.putText(img, str(table[i][j].id_con), (table[i][j].x_top_left, table[i][j].y_bottom_right), cv2.FONT_HERSHEY_PLAIN, 4, green_color)
     cv2.imwrite(path_save, img)
 def save_json(tables: List[ScanTable], number_test_string: str, path_output: str) -> None:
     for i in range(0, len(tables)):
-        with open(path_output + "{}_table_{}.json".format(number_test_string, i), "w") as out:
+        with open(f"{path_output}{number_test_string}_table_{i}.json", "w") as out:
             json.dump(tables[i].to_dict(), out, ensure_ascii=False, indent=2)
-def calc_accuracy(path_image: str,
-                  path_GT_struct: str,
-                  path_GT_text: str,
-                  path_save_image: str,
-                  path_save_json: str) -> None:
+def calc_accuracy(path_image: str, path_gt_struct: str, path_gt_text: str, path_save_image: str, path_save_json: str) -> None:
     from os import listdir
     from os.path import isfile, join
@@ -112,20 +102,20 @@ def calc_accuracy(path_image: str,
     image_files = [f for f in listdir(path_image) if isfile(join(path_image, f))]
     agreements = []
-    for i in range(0, len(image_files)):  # loop over the examples
-        name_example = image_files[i].split('.')[0].split('_')[0]
+    for image_file in image_files:
+        name_example = image_file.split(".")[0].split("_")[0]
         # predict tables
-        image = cv2.imread(path_image + image_files[i], 0)
+        image = cv2.imread(path_image + image_file, 0)
         # TODO fix this
         clean_images, tables = PdfImageReader(config=get_config()).get_tables([image])
-        draw_recognized_cell(tables, path_image + image_files[i], path_save_image + image_files[i])
+        draw_recognized_cell(tables, path_image + image_file, path_save_image + image_file)
         save_json(tables, name_example, path_save_json)
-        gt_files = [f for f in listdir(path_GT_struct) if isfile(join(path_GT_struct, f)) and name_example + "_" in f]
-        for index_table in range(0, len(gt_files)):  # loop over the tables in the image
+        gt_files = [f for f in listdir(path_gt_struct) if isfile(join(path_gt_struct, f)) and name_example + "_" in f]
+        for index_table in range(0, len(gt_files)):
-            csv_filename = path_GT_struct + name_example + "_" + str(index_table + 1) + ".csv"
-            csv_text_filename = path_GT_text + name_example + "_" + str(index_table + 1) + "_text.csv"
+            csv_filename = path_gt_struct + name_example + "_" + str(index_table + 1) + ".csv"
+            csv_text_filename = path_gt_text + name_example + "_" + str(index_table + 1) + "_text.csv"
             if os.path.exists(csv_filename):
                 if not os.path.exists(csv_text_filename):
                     csv_text_filename = ""
@@ -142,7 +132,7 @@ def calc_accuracy(path_image: str,
 if __name__ == "__main__":
-    current_path = os.path.dirname(__file__) + '/'
+    current_path = os.path.dirname(__file__) + "/"
calc_accuracy(current_path + "../../backend/test_dataset_table/images/", current_path + "../../backend/test_dataset_table/GT_struct/", current_path + "../../backend/test_dataset_table/GT_text/", diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/draw_result_table.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/draw_result_table.py index f7364fbf..34856339 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/draw_result_table.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/draw_result_table.py @@ -1,6 +1,7 @@ import argparse import glob import os + import cv2 import numpy as np @@ -18,15 +19,13 @@ if not os.path.exists(parser.output_folder): os.makedirs(parser.output_folder, exist_ok=True) - path_to_img = [parser.image] if parser.image is not None else glob.glob("{}/*".format(parser.input_folder)) + path_to_img = [parser.image] if parser.image is not None else glob.glob(f"{parser.input_folder}/*") for img_path in path_to_img: image = cv2.imread(img_path) color_backgr = np.max(image) padding = 40 - image_bigger = np.full((image.shape[0] + padding * 2, - image.shape[1] + padding * 2, - image.shape[2]), color_backgr) + image_bigger = np.full((image.shape[0] + padding * 2, image.shape[1] + padding * 2, image.shape[2]), color_backgr) image_bigger[padding:-padding, padding:-padding] = image # TODO fix this clean_images, tables = PdfImageReader(config={}).get_tables([cv2.cvtColor(image_bigger, cv2.COLOR_BGR2GRAY)]) @@ -44,4 +43,4 @@ if len(path_to_img) == 1: cv2.imwrite(parser.output_folder, res) else: - cv2.imwrite("{}/{}".format(parser.output_folder, img_path.split('/')[-1]), res) + cv2.imwrite(f"{parser.output_folder}/{img_path.split('/')[-1]}", res) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py index e072c0bb..4a1307ca 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py @@ -1,12 +1,13 @@ -from typing import Any, Tuple, List -import cv2 +import logging import math -import numpy as np import os -import logging +from typing import Any, List, Tuple + +import cv2 +import numpy as np from dedoc.config import get_config -from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree, ContourCell +from dedoc.readers.pdf_reader.data_classes.tables.table_tree import ContourCell, TableTree from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions from dedoc.utils.image_utils import rotate_image @@ -27,16 +28,16 @@ def rotate_with_threshold(img: np.ndarray, angle: float, threshold: float = None # Algorithm for finding lines by Houph. 
Allows you to eliminate gaps between lines and find the angle of the table -def apply_HouphLine(img: np.ndarray, threshold_gap: int = 10, *, config: dict) -> Tuple[np.ndarray, int]: - cdstP = np.copy(img) +def apply_houph_line(img: np.ndarray, threshold_gap: int = 10, *, config: dict) -> Tuple[np.ndarray, int]: + cdst_p = np.copy(img) dst = abs(img - 255) - linesP = cv2.HoughLinesP(dst, 1, np.pi / 180, 50, 100, 300, threshold_gap) + lines_p = cv2.HoughLinesP(dst, 1, np.pi / 180, 50, 100, 300, threshold_gap) k_hor = [] - if linesP is not None: - for i in range(0, len(linesP)): - line = linesP[i][0] + if lines_p is not None: + for i in range(0, len(lines_p)): + line = lines_p[i][0] # k - angle of line in degree if abs(line[0] - line[2]) == 0: k = math.atan(0) * 180.0 / math.pi @@ -45,9 +46,9 @@ def apply_HouphLine(img: np.ndarray, threshold_gap: int = 10, *, config: dict) - if abs(k) < 5: k_hor.append(k) - cv2.line(cdstP, (line[0], line[1]), (line[2], line[3]), (0, 0, 255), 1, cv2.LINE_AA) + cv2.line(cdst_p, (line[0], line[1]), (line[2], line[3]), (0, 0, 255), 1, cv2.LINE_AA) if (abs(k) < 95) and (abs(k) > 85): - cv2.line(cdstP, (line[0], line[1]), (line[2], line[3]), (0, 0, 255), 1, cv2.LINE_AA) + cv2.line(cdst_p, (line[0], line[1]), (line[2], line[3]), (0, 0, 255), 1, cv2.LINE_AA) angle = np.sum(k_hor) / len(k_hor) if len(k_hor) > 0 else 0 @@ -55,9 +56,9 @@ def apply_HouphLine(img: np.ndarray, threshold_gap: int = 10, *, config: dict) - angle = 0 if config.get("debug_mode", False): - logger.debug("angle_horiz_avg = {}".format(angle)) + logger.debug(f"angle_horiz_avg = {angle}") - return cdstP, angle + return cdst_p, angle def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [Any, Any, np.ndarray, float]: @@ -83,7 +84,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An # step 2 img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables") # step 3 - img_final_bin_houph, angle_alignment = __apply_Houph_lines_and_detect_angle(img_final_bin, config) + img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config) (thresh, img_final_bin_houph) = cv2.threshold(img_final_bin_houph, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) if config.get("debug_mode", False): @@ -104,19 +105,16 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An if config.get("debug_mode", False): cv2.imwrite(os.path.join(config["path_detect"], "img_houph_and_morph_wo_bound.jpg"), img_final_bin_houph) img_w_contour = img.copy() - cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, - maxLevel=8) + cv2.drawContours(img_w_contour, contours, contourIdx=-1, color=(0, 0, 0), thickness=10, hierarchy=hierarchy, maxLevel=8) cv2.imwrite(os.path.join(config["path_detect"], "img_with_contours.jpg"), img_w_contour) # Draw external contours for tables without external contours. 
It is a rare case, but important for invoices
     if table_options.table_wo_external_bounds in table_type:
-        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(),
-                                                                          contours, hierarchy)
+        contours, hierarchy = __get_contours_for_table_wo_external_bounds(img, img_final_bin_houph.copy(), contours, hierarchy)
     return contours, hierarchy, img, angle_alignment
-def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray,
-                                                contours: List, hierarchy: List) -> [Any, Any]:
+def __get_contours_for_table_wo_external_bounds(img: np.ndarray, img_with_contours: np.ndarray, contours: List, hierarchy: List) -> [Any, Any]:
     # get children (get table contours)
     contours = np.array(contours)
     list_contours, table_contours = __get_table_contours(contours, hierarchy)
@@ -153,22 +151,17 @@ def __filter_table(image: np.ndarray, table_image: np.ndarray) -> bool:
     table_area = table_image.shape[0] * table_image.shape[1]
     image_area = image.shape[0] * image.shape[1]
-    res = ((white_mean < 0.5) or
-           (black_mean > 0.3) or
-           (std < 30) or (mean < 150) or
-           (mean < 200 and std < 80) or
-           (table_area < image_area * 0.2))
-
+    res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or (table_area < image_area * 0.2)
     return res
 def __get_table_contours(contours: np.ndarray, hierarchy: List) -> [np.ndarray, np.ndarray]:
-    list_contours = np.array([id for id, h in enumerate(hierarchy[0]) if h[3] == 0], dtype=int)
+    list_contours = np.array([h_id for h_id, h in enumerate(hierarchy[0]) if h[3] == 0], dtype=int)
     return list_contours, contours[list_contours]
-def __apply_Houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np.ndarray, float]:
+def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np.ndarray, float]:
     # ----- search height, width table ----- #
     # ----- detect gap for houph ------- #
     contours, hierarchy = cv2.findContours(image, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
@@ -180,11 +173,11 @@
     else:
         gap_avg = 5
     if config["debug_mode"]:
-        config.get("logger", logging.getLogger()).debug("Houph gap = {}".format(gap_avg))
+        config.get("logger", logging.getLogger()).debug(f"Houph gap = {gap_avg}")
     # ----- image alignment -----
     # Houph apply
-    img_final_bin_houph, angle_alignment = apply_HouphLine(image, gap_avg, config=config)
+    img_final_bin_houph, angle_alignment = apply_houph_line(image, gap_avg, config=config)
     return img_final_bin_houph, angle_alignment
@@ -198,8 +191,8 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
     elif task == "tables":
         length_div = 55
         height_div = 100
-    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config['min_w_cell'])  # 35
-    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config['min_h_cell'])  # 100
+    kernel_length_weight = max(np.array(img_bin).shape[1] // length_div, config["min_w_cell"])  # 35
+    kernel_length_height = max(np.array(img_bin).shape[0] // height_div, config["min_h_cell"])  # 100
     # A vertical kernel of (1 X kernel_length), which will detect all the vertical lines from the image.
     verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length_height))
@@ -221,10 +214,10 @@ def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, ta
         cv2.imwrite(os.path.join(config["path_detect"], "verticle_lines.jpg"), verticle_lines_img)
         cv2.imwrite(os.path.join(config["path_detect"], "horizontal_lines.jpg"), horizontal_lines_img)
-    '''Now we will add these two images.
+    """Now we will add these two images.
     This will have only boxes and the information written in the box will be erased.
     So we can accurately detect the boxes and no noise will occur for false box extraction
-    '''
+    """
     # Weighting parameters, this will decide the quantity of an image to be added to make a new image.
     alpha = 0.5
     beta = 1.0 - alpha
@@ -270,7 +263,7 @@ def detect_tables_by_contours(img: np.ndarray,
     tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=config)
     if config.get("debug_mode", False):
-        config.get("logger", logging.getLogger()).debug("Hierarchy [Next, Previous, First_Child, Parent]:\n {}".format(hierarchy))
+        config.get("logger", logging.getLogger()).debug(f"Hierarchy [Next, Previous, First_Child, Parent]:\n {hierarchy}")
         tree_table.print_tree(depth=0)
     if config.get("debug_mode", False):
         cv2.imwrite(os.path.join(config["path_detect"], "img_draw_counters.jpg"), img)
@@ -281,7 +274,7 @@
         x, y, w, h = cv2.boundingRect(c)
         new_img = img[y:y + h, x:x + w]
         if config.get("debug_mode", False):
-            cv2.imwrite(os.path.join(config["path_cells"], str(ind) + '.png'), new_img)
+            cv2.imwrite(os.path.join(config["path_cells"], str(ind) + ".png"), new_img)
         cell_images.append(ContourCell(id_con=ind, image=new_img))
     tree_table.set_text_into_tree(tree_table,
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py
index 5f506618..a4a79ae0 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py
@@ -1,4 +1,5 @@
 import difflib
+
 import numpy as np
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_from_table_cell
@@ -42,7 +43,7 @@ def similarity(s1: str, s2: str) -> float:
 def detect_diff_orient(cell_text: str) -> bool:
     # 1 - split the text into lines that contain at least one character
-    parts = cell_text.split('\n')
+    parts = cell_text.split("\n")
     parts = [p for p in parts if len(p) > 0]
     # 2 - compute the average line length of the cell
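For context, a minimal self-contained sketch (not part of the patch) of the rotated-cell heuristic that detect_diff_orient implements: a cell is treated as vertically oriented when it consists of many short lines. The default thresholds here are illustrative stand-ins for the config keys "minimal_cell_cnt_line" and "minimal_cell_avg_length_line" used above:

    import numpy as np

    def is_rotated_cell(cell_text: str, min_cnt_line: int = 5, min_avg_length_line: float = 10.0) -> bool:
        parts = [p for p in cell_text.split("\n") if len(p) > 0]  # 1 - keep non-empty lines
        if not parts:
            return False
        avg_len_part = np.average([len(p) for p in parts])  # 2 - average line length of the cell
        return len(parts) > min_cnt_line and avg_len_part < min_avg_length_line  # many short lines -> rotated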
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py
index fa4eb61d..5f742f0a 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py
@@ -5,33 +5,32 @@
 import re
 import uuid
 from collections import namedtuple
-from typing import List, IO, Tuple, Match, Optional
+from typing import IO, List, Match, Optional, Tuple
+
 import cv2
 import numpy as np
 from PIL import Image
-
-from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
-from dedoc.data_structures.annotation import Annotation
-from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
-from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
-from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
-from dedoc.data_structures.concrete_annotations.style_annotation import StyleAnnotation
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTChar, LTAnno, LTTextBoxHorizontal, LTTextLineHorizontal, LTContainer, LTRect, \
-    LTFigure, LTImage, LTCurve, LTTextBox
+from pdfminer.layout import LAParams, LTAnno, LTChar, LTContainer, LTCurve, LTFigure, LTImage, LTRect, LTTextBox, LTTextBoxHorizontal, LTTextLineHorizontal
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfpage import PDFPage
-from dedoc.utils.pdf_utils import get_page_image
+from dedoc.common.exceptions.bad_file_error import BadFileFormatError
+from dedoc.data_structures.annotation import Annotation
 from dedoc.data_structures.bbox import BBox
+from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
+from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
+from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
+from dedoc.data_structures.concrete_annotations.style_annotation import StyleAnnotation
 from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.location import Location
 from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
+from dedoc.utils.pdf_utils import get_page_image
 StyleLine = namedtuple("StyleLine", ["begin", "end", "bold", "italic", "font_size", "font_style", "table_name"])
-logging.getLogger('pdfminer').setLevel(logging.ERROR)
+logging.getLogger("pdfminer").setLevel(logging.ERROR)
 class ExtractorPdfTextLayer(object):
@@ -50,13 +49,12 @@ def extract_text_layer(self, path: str, page_number: int, is_one_column_document
         :param path: path to pdf
         :return: pages_with_bbox - page with extracted text
         """
-        with open(path, 'rb') as fp:
+        with open(path, "rb") as fp:
             pages = PDFPage.get_pages(fp)
             for page_num, page in enumerate(pages):
                 if page_num != page_number:
                     continue
-                return self.__handle_page(page=page, page_number=page_number, path=path,
-                                          is_one_column_document=is_one_column_document)
+                return self.__handle_page(page=page, page_number=page_number, path=path, is_one_column_document=is_one_column_document)
     def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_column_document: bool) -> PageWithBBox:
         directory = os.path.dirname(path)
@@ -64,7 +62,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_colum
         try:
             interpreter.process_page(page)
         except Exception as e:
-            raise BadFileFormatException("can't handle file {} get {}".format(path, e))
+            raise BadFileFormatError(f"can't handle file {path}: {e}")
         layout = device.get_result()
         image_page = self.__get_image(path=path, page_num=page_number)
@@ -97,12 +95,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_colum
         bboxes = []
         for line_num, lobj in enumerate(lobjs_textline):
-            bbox = self.get_info_layout_object(lobj,
-                                               page_num=page_number,
-                                               line_num=line_num,
-                                               k_w=k_w,
-                                               k_h=k_h,
-                                               height=height)
+            bbox = self.get_info_layout_object(lobj, page_num=page_number, line_num=line_num, k_w=k_w, k_h=k_h, height=height)
             if bbox.bbox.width * bbox.bbox.height > 0:
                 bboxes.append(bbox)
@@ -121,16 +114,12 @@ def __extract_image(self,
         bbox =
self._create_bbox(k_h=k_h, k_w=k_w, height=height, lobj=lobj) location = Location(bbox=bbox, page_number=page_number) cropped = image_page[bbox.y_top_left: bbox.y_bottom_right, bbox.x_top_left: bbox.x_bottom_right] - uid = "fig_{}".format(uuid.uuid1()) - file_name = "{}.png".format(uid) + uid = f"fig_{uuid.uuid1()}" + file_name = f"{uid}.png" path_out = os.path.join(directory, file_name) Image.fromarray(cropped).save(path_out) image_page[bbox.y_top_left: bbox.y_bottom_right, bbox.x_top_left: bbox.x_bottom_right] = 255 - attachment = PdfImageAttachment(original_name=file_name, - tmp_file_path=path_out, - need_content_analysis=False, - uid=uid, - location=location) + attachment = PdfImageAttachment(original_name=file_name, tmp_file_path=path_out, need_content_analysis=False, uid=uid, location=location) except Exception as ex: self.logger.error(ex) attachment = None @@ -148,22 +137,14 @@ def __get_image(path: str, page_num: int) -> np.ndarray: def __get_interpreter(self, is_one_column_document: bool) -> Tuple[PDFPageAggregator, PDFPageInterpreter]: rsrcmgr = PDFResourceManager() if is_one_column_document is not None and is_one_column_document: - laparams = LAParams(line_margin=3.0, line_overlap=0.1, boxes_flow=0.5, word_margin=1.5, char_margin=100.0, - detect_vertical=False) + laparams = LAParams(line_margin=3.0, line_overlap=0.1, boxes_flow=0.5, word_margin=1.5, char_margin=100.0, detect_vertical=False) else: - laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, - detect_vertical=False) + laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False) device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) return device, interpreter - def __debug_extract_layout(self, - image_src: np.ndarray, - layout: LTContainer, - page_num: int, - k_w: float, - k_h: float, - page: PDFPage) -> None: + def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, page_num: int, k_w: float, k_h: float, page: PDFPage) -> None: """ Function for debugging of pdfminer.six layout :param layout: container of layout element @@ -173,7 +154,7 @@ def __debug_extract_layout(self, if not os.path.exists(tmp_dir): os.mkdir(tmp_dir) - file_text = open(os.path.join(tmp_dir, "text_{}.txt".format(page_num)), "wt") + file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt") # 1. 
extract layout objects lobjs = [lobj for lobj in layout] @@ -208,7 +189,7 @@ def __debug_extract_layout(self, self.__draw_layout_element(image_src, lobjs_images, file_text, k_w, k_h, page, (0, 255, 255), text="LTImage") self.__draw_layout_element(image_src, lobjs_curves, file_text, k_w, k_h, page, (0, 255, 255), text="LTCurve") - cv2.imwrite(os.path.join(tmp_dir, "img_page_{}.png".format(page_num)), image_src) + cv2.imwrite(os.path.join(tmp_dir, f"img_page_{page_num}.png"), image_src) file_text.close() def __draw_layout_element(self, @@ -220,17 +201,14 @@ def __draw_layout_element(self, page: PDFPage, color: Tuple[int, int, int], text: Optional[str] = None) -> None: - for line_num, lobj in enumerate(lobjs): + for lobj in lobjs: # converting coordinate from pdf format into image box_lobj = ExtractorPdfTextLayer.convert_coordinates_pdf_to_image(lobj, k_w, k_h, page.mediabox[3]) - cv2.rectangle(image_src, - (box_lobj.x_top_left, box_lobj.y_top_left), - (box_lobj.x_bottom_right, box_lobj.y_bottom_right), color) + cv2.rectangle(image_src, (box_lobj.x_top_left, box_lobj.y_top_left), (box_lobj.x_bottom_right, box_lobj.y_bottom_right), color) if text is not None: - cv2.putText(image_src, text, (box_lobj.x_top_left, box_lobj.y_top_left), cv2.FONT_HERSHEY_SIMPLEX, 1, - color) + cv2.putText(image_src, text, (box_lobj.x_top_left, box_lobj.y_top_left), cv2.FONT_HERSHEY_SIMPLEX, 1, color) else: file.write(lobj.get_text()) @@ -243,13 +221,7 @@ def convert_coordinates_pdf_to_image(lobj: LTContainer, k_w: float, k_h: float, return BBox(x0_new, y0_new, x1_new - x0_new, y1_new - y0_new) - def get_info_layout_object(self, - lobj: LTContainer, - page_num: int, - line_num: int, - k_w: float, - k_h: float, - height: int) -> TextWithBBox: + def get_info_layout_object(self, lobj: LTContainer, page_num: int, line_num: int, k_w: float, k_h: float, height: int) -> TextWithBBox: # 1 - converting coordinate from pdf format into image bbox = self._create_bbox(height, k_h, k_w, lobj) # 2 - extract text and text annotations from current object @@ -258,8 +230,7 @@ def get_info_layout_object(self, def _create_bbox(self, height: int, k_h: float, k_w: float, lobj: LTContainer) -> BBox: curr_box_line = ExtractorPdfTextLayer.convert_coordinates_pdf_to_image(lobj, k_w, k_h, height) - bbox = BBox.from_two_points((curr_box_line.x_top_left, curr_box_line.y_top_left), - (curr_box_line.x_bottom_right, curr_box_line.y_bottom_right)) + bbox = BBox.from_two_points((curr_box_line.x_top_left, curr_box_line.y_top_left), (curr_box_line.x_bottom_right, curr_box_line.y_bottom_right)) return bbox def _get_style_and_text_from_layout_object(self, lobj: LTContainer) -> [str, List[Annotation]]: @@ -279,21 +250,19 @@ def _get_line_style(self, lobj: LTTextLineHorizontal) -> List[Annotation]: chars_with_style = [] rand_weight = self._get_new_weight() prev_style = "" - for item, lobj_char in enumerate(lobj): + for lobj_char in lobj: if isinstance(lobj_char, LTChar) or isinstance(lobj_char, LTAnno): if len(chars_with_style) > 0: # check next char different from previously then we fresh rand_weight - prev_style, prev_size = chars_with_style[-1].split('_rand_') + prev_style, prev_size = chars_with_style[-1].split("_rand_") if isinstance(lobj_char, LTChar): - curr_style = "{}_{}".format(lobj_char.fontname, round(lobj_char.size, 0)) + curr_style = f"{lobj_char.fontname}_{round(lobj_char.size, 0)}" if curr_style != prev_style: rand_weight = self._get_new_weight() - chars_with_style.append("{}_rand_{}".format(curr_style, rand_weight)) - elif 
isinstance(lobj_char, LTAnno) \ - and lobj_char.get_text() in (' ', '\n') \ - and len(chars_with_style) > 0: + chars_with_style.append(f"{curr_style}_rand_{rand_weight}") + elif isinstance(lobj_char, LTAnno) and lobj_char.get_text() in (" ", "\n") and len(chars_with_style) > 0: # check on the space or \n (in pdfminer is type LTAnno) # duplicated previous style chars_with_style.append(chars_with_style[-1]) @@ -320,30 +289,30 @@ def _cleaning_text_from_hieroglyphics(self, text_str: str) -> str: def cid_recognized(self, m: Match) -> str: v = m.group(0) - v = v.strip('(') - v = v.strip(')') - ascii_num = v.split(':')[-1] + v = v.strip("(") + v = v.strip(")") + ascii_num = v.split(":")[-1] ascii_num = int(ascii_num) text_val = chr(ascii_num) return text_val def _get_new_weight(self) -> str: - return binascii.hexlify(os.urandom(8)).decode('ascii') + return binascii.hexlify(os.urandom(8)).decode("ascii") def __parse_style_string(self, chars_with_meta: str, begin: int, end: int) -> List[Annotation]: # style parsing line_anns = [] - prev_style, _ = chars_with_meta.split('_rand_') - font, size, *_ = prev_style.split('_') - fontname_wo_rand = font.split('+')[-1] - styles = fontname_wo_rand.split('-')[-1] + prev_style, _ = chars_with_meta.split("_rand_") + font, size, *_ = prev_style.split("_") + fontname_wo_rand = font.split("+")[-1] + styles = fontname_wo_rand.split("-")[-1] if "Bold" in styles: line_anns.append(BoldAnnotation(begin, end, value="True")) if "Italic" in styles: line_anns.append(ItalicAnnotation(begin, end, value="True")) line_anns.append(StyleAnnotation(begin, end, value=fontname_wo_rand)) - if size.replace('.', '', 1).isnumeric(): + if size.replace(".", "", 1).isnumeric(): line_anns.append(SizeAnnotation(begin, end, value=size)) return line_anns diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index e1d6cecb..d7b34db0 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -9,6 +9,7 @@ from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError +from dedoc.data_structures import BBoxAnnotation from dedoc.data_structures.bbox import BBox from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation @@ -54,10 +55,7 @@ def __init__(self, *, config: dict) -> None: self.tabby_java_version = "2.0.0" self.jar_name = "ispras_tbl_extr.jar" self.jar_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tabbypdf", "jars")) - self.java_not_found_error = ( - "`java` command is not found from this Python process." - "Please ensure Java is installed and PATH is set for `java`" - ) + self.java_not_found_error = "`java` command is not found from this Python process. 
Please ensure Java is installed and PATH is set for `java`" self.default_config = {"JAR_PATH": os.path.join(self.jar_dir, self.jar_name)} def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: @@ -167,6 +165,9 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith annotations.append(IndentationAnnotation(0, len_block, str(indent))) annotations.append(SpacingAnnotation(0, len_block, str(spacing))) + bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right)) + annotations.append(BBoxAnnotation(0, len_block, bbox)) + for annotation in block["annotations"]: is_bold = annotation["is_bold"] is_italic = annotation["is_italic"] @@ -176,9 +177,15 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith url = annotation["url"] start = annotation["start"] end = annotation["end"] - + wx_top_left = annotation["x_top_left"] + wy_top_left = annotation["y_top_left"] + wx_bottom_right = wx_top_left + annotation["width"] + wy_bottom_right = wy_top_left + annotation["height"] + wbbox = BBox.from_two_points((wx_top_left, wy_top_left), + (wx_bottom_right, wy_bottom_right)) annotations.append(SizeAnnotation(start, end, str(font_size))) annotations.append(StyleAnnotation(start, end, font_name)) + annotations.append(BBoxAnnotation(start, end, wbbox)) if is_bold: annotations.append(BoldAnnotation(start, end, "True")) @@ -190,8 +197,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith annotations.append(LinkedTextAnnotation(start, end, url)) meta = block["metadata"].lower() - uid = "txt_{}_{}".format(file_hash, order) - bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right)) + uid = f"txt_{file_hash}_{order}" metadata = LineMetadata(page_id=page_number, line_id=order) line_with_location = LineWithLocation(line=block_text, @@ -229,7 +235,7 @@ def __run(self, path: str = None, encoding: str = "utf-8", try: result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.DEVNULL, check=True) if result.stderr: - self.logger.warning("Got stderr: {}".format(result.stderr.decode(encoding))) + self.logger.warning(f"Got stderr: {result.stderr.decode(encoding)}") return result.stdout except FileNotFoundError: raise JavaNotFoundError(self.java_not_found_error) @@ -238,7 +244,7 @@ def __process_pdf(self, path: str, start_page: int = None, end_page: int = None) -> dict: output = self.__run(path=path, start_page=start_page, end_page=end_page) - response = output.decode('UTF-8') + response = output.decode("UTF-8") document = json.loads(response) if response else {} return document diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 2884ac2c..21bf6943 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -1,14 +1,15 @@ import os -from typing import Optional, Tuple, List +from typing import List, Optional, Tuple + import numpy as np -from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer +from dedoc.data_structures.bbox import BBox from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
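A minimal sketch of how the word-level bbox annotations above are assembled, assuming an annotation dict with the keys the tabby JAR emits (the dict values here are illustrative, not real output):

    from dedoc.data_structures import BBoxAnnotation
    from dedoc.data_structures.bbox import BBox

    # hypothetical word annotation as parsed from the tabby JSON
    annotation = {"start": 0, "end": 5, "x_top_left": 10, "y_top_left": 20, "width": 40, "height": 12}
    wbbox = BBox.from_two_points((annotation["x_top_left"], annotation["y_top_left"]),
                                 (annotation["x_top_left"] + annotation["width"], annotation["y_top_left"] + annotation["height"]))
    # the annotation value is the BBox itself, as in BBoxAnnotation(0, len_block, bbox) above
    annotations = [BBoxAnnotation(annotation["start"], annotation["end"], wbbox)]

from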
dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.pdf_base_reader import PdfBaseReader, ParametersForParseDoc +from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader +from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox -from dedoc.data_structures.bbox import BBox class PdfTxtlayerReader(PdfBaseReader): @@ -35,6 +36,7 @@ def can_read(self, path: str, mime: str, extension: str, document_type: Optional Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ + parameters = {} if parameters is None else parameters return extension.lower().endswith("pdf") and (str(parameters.get("pdf_with_text_layer", "false")).lower() == "true") def _process_one_page(self, @@ -44,22 +46,20 @@ def _process_one_page(self, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]: gray_image = self._convert_to_gray(image) if parameters.need_pdf_table_analysis: - cleaned_image, tables = self.table_recognizer. \ - recognize_tables_from_image(image=gray_image, - page_number=page_number, - language=parameters.language, - orient_analysis_cells=parameters.orient_analysis_cells, - orient_cell_angle=parameters.orient_cell_angle, - table_type=parameters.table_type) + cleaned_image, tables = self.table_recognizer.recognize_tables_from_image( + image=gray_image, + page_number=page_number, + language=parameters.language, + orient_analysis_cells=parameters.orient_analysis_cells, + orient_cell_angle=parameters.orient_cell_angle, + table_type=parameters.table_type + ) else: tables = [] - is_one_column_document_list = None if parameters.is_one_column_document_list is None \ - else parameters.is_one_column_document_list[page_number] + is_one_column_document_list = None if parameters.is_one_column_document_list is None else parameters.is_one_column_document_list[page_number] - page = self.extractor_layer.extract_text_layer(path=path, - page_number=page_number, - is_one_column_document=is_one_column_document_list) + page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, is_one_column_document=is_one_column_document_list) if page is None: return [], [], [] unreadable_blocks = [location.bbox for table in tables for location in table.locations] diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/tabbypdf.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/tabbypdf.py index 83c83114..6a8421ad 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/tabbypdf.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/tabbypdf.py @@ -3,21 +3,16 @@ import subprocess from asyncio.log import logger +from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError + TABBY_JAVA_VERSION = "2.0.0" JAR_NAME = "ispras_tbl_extr.jar" JAR_DIR = os.path.abspath(os.path.dirname(__file__)) -JAVA_NOT_FOUND_ERROR = ( - "`java` command is not found from this Python process." - "Please ensure Java is installed and PATH is set for `java`" -) +JAVA_NOT_FOUND_ERROR = "`java` command is not found from this Python process. 
Please ensure Java is installed and PATH is set for `java`" DEFAULT_CONFIG = {"JAR_PATH": os.path.join(JAR_DIR, JAR_NAME)} -class JavaNotFoundError(Exception): - pass - - def _jar_path() -> str: return os.environ.get("TABBY_JAR", DEFAULT_CONFIG["JAR_PATH"]) @@ -26,25 +21,19 @@ def _run(path: str = None, encoding: str = "utf-8") -> bytes: args = ["java"] + ["-jar", _jar_path(), "-i", path] try: - result = subprocess.run( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=subprocess.DEVNULL, - check=True, - ) + result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.DEVNULL, check=True) if result.stderr: - logger.warning("Got stderr: {}".format(result.stderr.decode(encoding))) + logger.warning(f"Got stderr: {result.stderr.decode(encoding)}") return result.stdout except FileNotFoundError: raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR) except subprocess.CalledProcessError as e: - logger.error("Error from tabby-java:\n{}\n".format(e.stderr.decode(encoding))) + logger.error(f"Error from tabby-java:\n{e.stderr.decode(encoding)}\n") raise def extract(path: str) -> dict: output = _run(path) - response = output.decode('UTF-8') + response = output.decode("UTF-8") document = json.loads(response) return document diff --git a/dedoc/readers/pdf_reader/utils/header_footers_analysis.py b/dedoc/readers/pdf_reader/utils/header_footers_analysis.py index f7874b14..30e2c567 100644 --- a/dedoc/readers/pdf_reader/utils/header_footers_analysis.py +++ b/dedoc/readers/pdf_reader/utils/header_footers_analysis.py @@ -1,14 +1,15 @@ -from typing import List, Optional -import numpy as np import difflib import re from collections import Counter +from typing import List, Optional, Tuple + +import numpy as np from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation def _get_pattern(s: str) -> str: - return re.sub(r'[0-9]+', '@', s.lower().strip()) + return re.sub(r"\d+", "@", s.lower().strip()) def _similarity_with_replacement(s1: str, s2: str) -> float: @@ -42,9 +43,10 @@ def _strip_empty_lines(lines: List[List[LineWithLocation]]) -> List[List[LineWit return lines -def _remove_header_footer(is_footer_header: List[bool], popular_patterns: List[List[str]], - lines: List[List[LineWithLocation]], page_id: int, line_id: int) \ - -> Optional[LineWithLocation]: +def _remove_header_footer(is_footer_header: List[bool], + popular_patterns: List[List[str]], + lines: List[List[LineWithLocation]], + page_id: int, line_id: int) -> Optional[LineWithLocation]: if not is_footer_header[line_id] or abs(line_id) >= len(lines[page_id]): return None @@ -58,8 +60,7 @@ def _remove_header_footer(is_footer_header: List[bool], popular_patterns: List[L return None -def _get_popular_pattern(is_footer_header: List[bool], max_cnt_lines: int, threshold: float, patterns: List[List[str]]) \ - -> List[List[str]]: +def _get_popular_pattern(is_footer_header: List[bool], max_cnt_lines: int, threshold: float, patterns: List[List[str]]) -> List[List[str]]: # Algorithm: a line counts as a header/footer if its pattern covers more than 40% of pages # in a doc with changing header-footers and more than 70% in a doc with constant header-footers # is_footer_header = [True, False, False, False, True, True ] @@ -68,10 +69,10 @@ def _get_popular_pattern(is_footer_header: List[bool], max_cnt_lines: int, thres popular_patterns = [[] for _ in range(max_cnt_lines)] - for num, patterns_on_line in enumerate(patterns): + for num, pattern in enumerate(patterns): if not is_footer_header[num]: continue - filter_pattern = [p for p in patterns[num] if p != ''
and p] + filter_pattern = [p for p in pattern if p] uniques = np.array(list(Counter(filter_pattern).keys())) freqs = np.array(list(Counter(filter_pattern).values())) / len(filter_pattern) @@ -80,8 +81,8 @@ def _get_popular_pattern(is_footer_header: List[bool], max_cnt_lines: int, thres return popular_patterns -def footer_header_analysis(lines: List[List[LineWithLocation]], threshold: float = 0.5) -> \ - [List[List[LineWithLocation]], List[List[LineWithLocation]], List[List[LineWithLocation]]]: +def footer_header_analysis(lines: List[List[LineWithLocation]], threshold: float = 0.5) \ - -> Tuple[List[List[LineWithLocation]], List[List[LineWithLocation]], List[List[LineWithLocation]]]: # 1. initialize the weights, the sliding window and the scores # first 4 weight for header, last 4 weight for footer weights = [1.0, 1.0, 0.85, 0.75, 0.75, 0.85, 1.0, 1.0] @@ -116,13 +117,11 @@ def footer_header_analysis(lines: List[List[LineWithLocation]], threshold: float # calc score for header for line_index in range(max_cnt_lines // 2): # calculation header score - scores[line_index] += weights[line_index] * _similarity_with_replacement(s1=patterns[line_index][page_one], - s2=patterns[line_index][page_two]) + scores[line_index] += weights[line_index] * _similarity_with_replacement(s1=patterns[line_index][page_one], s2=patterns[line_index][page_two]) # calculation footer score - scores[-line_index - 1] += weights[-line_index - 1] * _similarity_with_replacement( - s1=patterns[-line_index - 1][page_one], - s2=patterns[-line_index - 1][page_two]) + similarity = _similarity_with_replacement(s1=patterns[-line_index - 1][page_one], s2=patterns[-line_index - 1][page_two]) + scores[-line_index - 1] += weights[-line_index - 1] * similarity cnt_cmpr += 1 @@ -130,9 +129,7 @@ def footer_header_analysis(lines: List[List[LineWithLocation]], threshold: float is_footer_header = scores > threshold # 4 - get the popular pattern from lines with high scores - popular_patterns = _get_popular_pattern(is_footer_header, max_cnt_lines, - threshold=0.4 if step_hf == 2 else 0.7, - patterns=patterns) + popular_patterns = _get_popular_pattern(is_footer_header, max_cnt_lines, threshold=0.4 if step_hf == 2 else 0.7, patterns=patterns) # 5 - delete only those lines which match with popular patterns headers, footers = [], [] diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py index 46a2cc85..8d9a9125 100644 --- a/dedoc/readers/pdf_reader/utils/line_object_linker.py +++ b/dedoc/readers/pdf_reader/utils/line_object_linker.py @@ -1,12 +1,12 @@ import logging -from collections import deque, defaultdict -from typing import List, Union, Dict +from collections import defaultdict, deque +from typing import Dict, List, Union +from dedoc.data_structures.bbox import BBox from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.line_metadata import LineMetadata from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -23,10 +23,7 @@ def __init__(self, *, config: dict) -> None:
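A small illustration of the pattern normalization used in header_footers_analysis above: _get_pattern collapses every digit run to "@", so headers that differ only in page numbers or dates map to the same pattern (the inputs are made up):

    _get_pattern("Page 3 of 10")    # -> "page @ of @"
    _get_pattern("Page 17 of 10")   # -> "page @ of @", same pattern, so both lines support one header candidate
    _get_pattern("Confidential")    # -> "confidential", no digits, left as-is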
self.config = config self.logger = config.get("logger", logging.getLogger()) - def link_objects(self, - lines: List[LineWithLocation], - tables: List[ScanTable], - images: List[PdfImageAttachment]) -> List[LineWithLocation]: + def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], images: List[PdfImageAttachment]) -> List[LineWithLocation]: """ add annotations to lines. Add annotations with links to the tables, images and other objects. Add spacing links to the lines @@ -36,13 +33,8 @@ def link_objects(self, @return: """ if len(lines) == 0: - metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), - page_id=0, - line_id=0) - lines = [LineWithLocation(line="", - metadata=metadata, - annotations=[], - location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))] + metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), page_id=0, line_id=0) + lines = [LineWithLocation(line="", metadata=metadata, annotations=[], location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))] last_page_line = self._get_last_page_line(lines) all_objects = list(lines + tables + images) all_objects.sort(key=lambda o: (o.order, o.location)) @@ -50,7 +42,7 @@ def link_objects(self, self._add_lines(all_objects, "previous_lines", objects_with_line_candidate) self._add_lines(all_objects[::-1], "next_lines", objects_with_line_candidate) - for uid, object_with_lines in objects_with_line_candidate.items(): + for object_with_lines in objects_with_line_candidate.values(): page_object = object_with_lines["object"] best_line = self._find_closest_line(page_object=page_object, lines_before=object_with_lines["previous_lines"], @@ -61,9 +53,9 @@ def link_objects(self, elif isinstance(page_object, PdfImageAttachment): annotation = AttachAnnotation(attach_uid=page_object.uid, start=0, end=len(best_line.line)) else: - self.logger.warning("Unsupported page object type {}".format(page_object)) + self.logger.warning(f"Unsupported page object type {page_object}") if self.config.get("debug_mode", False): - raise Exception("Unsupported page object type {}".format(page_object)) + raise Exception(f"Unsupported page object type {page_object}") best_line.annotations.append(annotation) # noqa return lines @@ -93,8 +85,7 @@ def _find_closest_line(self, @return: best line to link with object """ all_lines = lines_before + lines_after - line_on_same_page = [line for line in all_lines - if line.location.page_number == page_object.location.page_number] + line_on_same_page = [line for line in all_lines if line.location.page_number == page_object.location.page_number] # no one line on the same page if len(line_on_same_page) == 0: previous_page_id = page_object.location.page_number - 1 @@ -105,8 +96,7 @@ def _find_closest_line(self, return max(lines_prev_page, key=lambda line: line.location) else: return min(all_lines, key=lambda line: line.location) - line_with_distance = [(self._distance_bboxes(line, page_object.location.bbox), line) - for line in line_on_same_page] + line_with_distance = [(self._distance_bboxes(line, page_object.location.bbox), line) for line in line_on_same_page] return min(line_with_distance, key=lambda t: t[0])[1] @staticmethod @@ -115,8 +105,7 @@ def _distance_bboxes(line: LineWithLocation, object_bbox: BBox) -> float: calculate the "distance between two bboxes" """ line_bbox = line.location.bbox - vertical_distance_abs = min(abs(line_bbox.y_top_left - object_bbox.y_bottom_right), - abs(line_bbox.y_bottom_right - object_bbox.y_top_left)) + vertical_distance_abs = 
min(abs(line_bbox.y_top_left - object_bbox.y_bottom_right), abs(line_bbox.y_bottom_right - object_bbox.y_top_left)) vertical_distance = vertical_distance_abs / (object_bbox.height + 1e-3) # calculate horizontal intersection diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py index 51709420..4abb4679 100644 --- a/dedoc/readers/pptx_reader/pptx_reader.py +++ b/dedoc/readers/pptx_reader/pptx_reader.py @@ -1,5 +1,6 @@ import os from typing import Optional + from pptx import Presentation from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor diff --git a/dedoc/readers/reader_composition.py b/dedoc/readers/reader_composition.py index a49f07bc..8332b140 100644 --- a/dedoc/readers/reader_composition.py +++ b/dedoc/readers/reader_composition.py @@ -1,13 +1,11 @@ -import inspect import os -import warnings from typing import Dict, List -from dedoc.common.exceptions.bad_file_exception import BadFileFormatException +from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader -from dedoc.utils.utils import splitext_, get_file_mime_type +from dedoc.utils.utils import get_file_mime_type, splitext_ class ReaderComposition(object): @@ -39,13 +37,7 @@ def parse_file(self, tmp_dir: str, filename: str, parameters: Dict[str, str]) -> document_type = parameters.get("document_type") for reader in self.readers: - if "parameters" in inspect.getfullargspec(reader.can_read).args: - can_read = reader.can_read(path=file_path, mime=mime, extension=extension, document_type=document_type, parameters=parameters) - else: - warnings.warn("!WARNING! 
you reader requires an update\n" + - "Please specify parameters argument in method can_read in {}\n".format(reader) + - " This parameters would be mandatory in the near future") - can_read = reader.can_read(path=file_path, mime=mime, extension=extension, document_type=document_type) + can_read = reader.can_read(path=file_path, mime=mime, extension=extension, document_type=document_type, parameters=parameters) if can_read: unstructured_document = reader.read(path=file_path, document_type=document_type, parameters=parameters) @@ -53,7 +45,7 @@ def parse_file(self, tmp_dir: str, filename: str, parameters: Dict[str, str]) -> assert isinstance(unstructured_document, UnstructuredDocument) # TODO remove return unstructured_document - raise BadFileFormatException( + raise BadFileFormatError( msg=f"No one can read file: name = {filename}, extension = {extension}, mime = {mime}, document type = {document_type}", msg_api=f"Unsupported file format {mime} of the input file {filename}" ) diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py index 5f69eaaf..d52ae567 100644 --- a/dedoc/readers/txt_reader/raw_text_reader.py +++ b/dedoc/readers/txt_reader/raw_text_reader.py @@ -3,7 +3,7 @@ import logging import re import time -from typing import Optional, Tuple, Iterable, List +from typing import Iterable, List, Optional, Tuple from unicodedata import normalize from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation @@ -44,7 +44,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio parameters = {} if parameters is None else parameters encoding = self.__get_encoding(path=path, parameters=parameters) lines = self._get_lines_with_meta(path=path, encoding=encoding) - encoding_warning = "encoding is {}".format(encoding) + encoding_warning = f"encoding is {encoding}" result = UnstructuredDocument(lines=lines, tables=[], attachments=[], warnings=[encoding_warning]) return self._postprocess(result) @@ -63,11 +63,11 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]: for line_id, line in self.__get_lines(path=path, encoding=encoding): if time.time() - previous_log_time > 5: - self.logger.info("done {} lines".format(line_id)) + self.logger.info(f"done {line_id} lines") previous_log_time = time.time() metadata = LineMetadata(page_id=0, line_id=line_id) - uid = "txt_{}_{}".format(file_hash, line_id) + uid = f"txt_{file_hash}_{line_id}" spacing_annotation_value = str(int(100 * (0.5 if number_of_empty_lines == 0 else number_of_empty_lines))) spacing_annotation = SpacingAnnotation(start=0, end=len(line), value=spacing_annotation_value) indent_annotation = self.__get_indent_annotation(line) @@ -88,13 +88,13 @@ def __get_lines(self, path: str, encoding: str) -> Iterable[Tuple[int, str]]: if path.lower().endswith("txt"): with codecs.open(path, errors="ignore", encoding=encoding) as file: for line_id, line in enumerate(file): - line = normalize('NFC', line).replace("й", "й") # й replace matter + line = normalize("NFC", line).replace("й", "й") # й replace matter yield line_id, line else: with gzip.open(path) as file: for line_id, line in enumerate(file): line = line.decode(encoding) - line = normalize('NFC', line).replace("й", "й") + line = normalize("NFC", line).replace("й", "й") yield line_id, line def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int: diff --git a/dedoc/scripts/benchmark_tl_correctness.py b/dedoc/scripts/benchmark_tl_correctness.py index 
6258f9f4..b51fdc6e 100644 --- a/dedoc/scripts/benchmark_tl_correctness.py +++ b/dedoc/scripts/benchmark_tl_correctness.py @@ -5,9 +5,9 @@ import requests import wget +from config import get_config from tqdm import tqdm -from config import get_config from dedoc.utils.utils import send_file path_result = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) diff --git a/dedoc/scripts/calc_tesseract_benchmarks.py b/dedoc/scripts/calc_tesseract_benchmarks.py index 77656bae..0db13299 100644 --- a/dedoc/scripts/calc_tesseract_benchmarks.py +++ b/dedoc/scripts/calc_tesseract_benchmarks.py @@ -1,15 +1,15 @@ -import os import argparse -import zipfile -from typing import List, Dict -from texttable import Texttable +import os import re -import pytesseract -import cv2 -import numpy as np -from tempfile import TemporaryDirectory import shutil +import zipfile +from tempfile import TemporaryDirectory +from typing import Dict, List +import cv2 +import numpy as np +import pytesseract +from texttable import Texttable parser = argparse.ArgumentParser() parser.add_argument("--input_path", "-i", type=str, default="../../resources/benchmarks/data_tesseract_benchmarks.zip") diff --git a/dedoc/scripts/create_txtlayer_dataset.py b/dedoc/scripts/create_txtlayer_dataset.py index 0714b4ed..744307b6 100644 --- a/dedoc/scripts/create_txtlayer_dataset.py +++ b/dedoc/scripts/create_txtlayer_dataset.py @@ -8,7 +8,7 @@ import numpy as np import requests -from PIL import ImageFont, Image, ImageDraw +from PIL import Image, ImageDraw, ImageFont from bs4 import BeautifulSoup from tqdm import tqdm diff --git a/dedoc/scripts/train/train_acc_orientation_classifier.py b/dedoc/scripts/train/train_acc_orientation_classifier.py index e22458ef..71c456c8 100644 --- a/dedoc/scripts/train/train_acc_orientation_classifier.py +++ b/dedoc/scripts/train/train_acc_orientation_classifier.py @@ -12,7 +12,6 @@ from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.dataset_executor import DataLoaderImageOrient - parser = argparse.ArgumentParser() checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../resources/efficient_net_b0_fixed.pth")) diff --git a/dedoc/scripts/train/train_catboost_detect_tl_correctness.py b/dedoc/scripts/train/train_catboost_detect_tl_correctness.py new file mode 100644 index 00000000..84b9c4b6 --- /dev/null +++ b/dedoc/scripts/train/train_catboost_detect_tl_correctness.py @@ -0,0 +1,133 @@ +import os +from pathlib import Path +from typing import List + +import pandas as pd +from catboost import CatBoostClassifier, Pool +from sklearn.metrics import f1_score +import gzip +import pickle + +from dedoc.readers.pdf_reader.pdf_auto_reader.catboost_model_extractor import CatboostModelExtractor + + +class GetTextAndTarget: + """ + The GetTextAndTarget class is used for loading and processing text data from correct and incorrect text files. 
+ """ + def __init__(self, path_correct_texts: str, path_incorrect_texts: str) -> None: + self.path_correct_texts = self.make_path(Path(path_correct_texts)) + self.path_incorrect_texts = self.make_path(Path(path_incorrect_texts)) + self.path_all = self.path_correct_texts + self.path_incorrect_texts + + def make_path(self, path: Path) -> List[str]: + path_all = [] + if path.is_dir(): + for subdir in path.iterdir(): + for subsubdir in subdir.iterdir(): + path_all.append(str(subsubdir)) + else: + print("Empty dir ", path) + return path_all + + def __len__(self) -> int: + return len(self.path_all) + + def __getitem__(self, item: int) -> dict: + try: + with open(self.path_all[item], mode="r") as f: + text = f.read() + except Exception as e: + print(f'Bad file {str(e)}: ', self.path_all[item]) + + try: + if len(text.strip()) == 0: + raise Exception('Empty file') + except Exception as error: + print('Caught this error: ' + str(error)) + + label = 1 if self.path_all[item] in str(self.path_correct_texts) else 0 + + return {"text": text, "label": label} + + +class GetFeaturesFromText(CatboostModelExtractor): + """ + The GetFeaturesFromText class is used for extracting features from text data. + """ + def __init__(self, *, config: dict) -> None: + super().__init__(config=config) + + def __len__(self) -> int: + return len(self.list_symbols) + + def get_feature(self, correct_data_path: str, not_correct_data_path: str) -> dict: + """ + Generate features and labels for the given dataset. + :param correct_data_path: Path to the directory containing correct text files. + :param not_correct_data_path: Path to the directory containing incorrect text files. + :returns: a dictionary containing features and labels. + """ + dataset = GetTextAndTarget(path_correct_texts=correct_data_path, path_incorrect_texts=not_correct_data_path) + label = [] + features = [] + for data in dataset: + list_of_sub = [] + num_letters_in_data = self._count_letters(data["text"]) + num_other_symboll_in_data = self._count_other(data["text"]) + for symbol in self.list_letters: + if num_letters_in_data != 0: + list_of_sub.append(round(data["text"].count(symbol) / num_letters_in_data, 5)) + else: + list_of_sub.append(0.0) + for symbol in self.list_symbols: + list_of_sub.append(data["text"].count(symbol)) + list_of_sub.append(num_letters_in_data + num_other_symboll_in_data / len(data["text"]) if len(data["text"]) != 0 else 0) + features.append(list_of_sub) + label.append(data["label"]) + return {"features": features, "label": label} + + def get_need_dataframe(self, correct_data_path: str, not_correct_data_path: str, csv_name: str) -> pd.DataFrame: + """ + Create a DataFrame from the given dataset and save it as a CSV file. + :param correct_data_path: Path to the directory containing correct text files. + :param not_correct_data_path: Path to the directory containing incorrect text files. + :param csv_name: Name of the output CSV file. + :returns: The generated DataFrame. 
+ """ + features = self.get_feature(correct_data_path=correct_data_path, not_correct_data_path=not_correct_data_path) + df = pd.DataFrame(features["features"]) + df.to_csv(csv_name, sep='\t', index=False) + return df + + +def train() -> None: + boost = GetFeaturesFromText(config={}) + features_train = boost.get_feature(correct_data_path=os.getcwd() + "/data/correct/", + not_correct_data_path=os.getcwd() + "/data/not_correct/") + features_test = boost.get_feature(correct_data_path=os.getcwd() + "/data/correct_test/", + not_correct_data_path=os.getcwd() + "/data/not_correct_test/") + features_val = boost.get_feature(correct_data_path=os.getcwd() + "/data/correct_val/", + not_correct_data_path=os.getcwd() + "/data/not_correct_val/") + + df_train = pd.DataFrame(features_train["features"]) + df_test = pd.DataFrame(features_test["features"]) + df_val = pd.DataFrame(features_val["features"]) + df_train_label = features_train["label"] + df_test_label = features_test["label"] + df_val_label = features_val["label"] + + booster = CatBoostClassifier(iterations=100, verbose=10, task_type="CPU", devices="0") + + train_data = Pool(df_train, df_train_label) + test_data = Pool(df_test, df_test_label) + val_data = Pool(df_val, df_val_label) + + booster.fit(train_data, eval_set=val_data, plot=True) + + test_preds = booster.predict(test_data) + + f1_score(df_test_label, test_preds) + + with gzip.open('catboost_detect_tl_correctness.pkl.gz', 'wb') as file: + pickle.dump(booster, file) diff --git a/dedoc/scripts/train/train_diploma_line_classifier.py b/dedoc/scripts/train/train_diploma_line_classifier.py index f25259c8..1cbf03a7 100644 --- a/dedoc/scripts/train/train_diploma_line_classifier.py +++ b/dedoc/scripts/train/train_diploma_line_classifier.py @@ -2,9 +2,8 @@ import os from typing import Optional -from dedoc.structure_extractors.feature_extractors.diploma_feature_extractor import DiplomaFeatureExtractor - from dedoc.config import _config as config +from dedoc.structure_extractors.feature_extractors.diploma_feature_extractor import DiplomaFeatureExtractor from dedoc.train_dataset.trainer.xgboost_line_classifier_trainer import XGBoostLineClassifierTrainer diff --git a/dedoc/scripts/train/train_line_metadata_classifier.py b/dedoc/scripts/train/train_line_metadata_classifier.py new file mode 100644 index 00000000..78be1534 --- /dev/null +++ b/dedoc/scripts/train/train_line_metadata_classifier.py @@ -0,0 +1,223 @@ +import argparse +import json +import os +import random +import time +import warnings +from collections import defaultdict +from itertools import chain +from typing import List, Tuple + +import numpy as np +import torch +from PIL import Image +from joblib import Parallel, delayed +from numpy import mean +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import train_test_split +from torch.nn import BCELoss +from torch.nn import Sequential, Linear, ReLU, Sigmoid, BatchNorm1d +from torch.nn.modules.loss import _Loss +from torch.optim import Adam, Optimizer +from torch.utils.data import Dataset, DataLoader +from torchvision.models import resnet18, ResNet +from torchvision.transforms import ToTensor +from tqdm import tqdm + + +from dedoc.data_structures.bbox import BBox +from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.font_type_classifier import FontTypeClassifier +from dedoc.utils.image_utils import get_bbox_from_image + +parser = argparse.ArgumentParser(add_help=True) +parser.add_argument("-l", "--labels_path", type=str, help="path to the json file with 
labeled bboxes", required=True) +parser.add_argument("-o", "--output_file", type=str, help="name of file with trained classifier", required=True) +args = parser.parse_args() + +print("GO") + +path = args.labels_path +path_out = args.output_file +seed = 42 + +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +np.random.seed(seed) +random.seed(seed) + +device = "cuda" if torch.cuda.is_available() else "cpu" + +print(device) + + +def get_model() -> ResNet: + model = resnet18(pretrained=True) + model.fc = Sequential( + Linear(in_features=512, out_features=256), + ReLU(), + BatchNorm1d(256), + Linear(256, out_features=2), + Sigmoid(), + ) + return model + + +class FontTypeDataset(Dataset): + + def __init__(self, path: str, items: List[dict]) -> None: + super().__init__() + self.labels_list = FontTypeClassifier.labels_list + self.to_tensor = ToTensor() + + self.images = Parallel(n_jobs=8)(delayed(self._image2cropped)(path, i) for i in tqdm(items)) + self.images = [self.to_tensor(image) for image in self.images] + labels = [] + for item in items: + labels.append(self._encode_labels(item)) + self.labels = torch.tensor(labels).float() + + def _image2cropped(self, path: str, item: dict) -> Image: + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + image_path = os.path.join(path, "original_documents", item["data"]["original_document_name"]) + image = Image.open(image_path) + bbox_dict = item["data"]["bbox"]["bbox"] + bbox = BBox(x_top_left=bbox_dict["x_upper_left"], + y_top_left=bbox_dict["y_upper_left"], + height=bbox_dict["height"], + width=bbox_dict["width"] + ) + return get_bbox_from_image(image=image, bbox=bbox) + + def _encode_labels(self, item: dict) -> List[int]: + labels_item = [] + for label in self.labels_list: + if label in item["labeled"]: + labels_item.append(1) + else: + labels_item.append(0) + assert len(labels_item) == len(self.labels_list) + return labels_item + + def __getitem__(self, index: int) -> Tuple[Image, torch.Tensor]: + return self.images[index], self.labels[index] + + def __len__(self) -> int: + return len(self.labels) + + +def get_data(path: str) -> Tuple[List, List]: + grouped_tasks = defaultdict(list) + + with open(os.path.join(path, "labeled_tasks.json")) as file: + data = json.load(file) + for item in data.values(): + image = item["data"]["original_document_name"] + if os.path.isfile(os.path.join(path, "original_documents", image)): + grouped_tasks[image].append(item) + + train_group, val_group = train_test_split(list(grouped_tasks.values()), train_size=0.8, ) + train_group = list(chain(*train_group)) + val_group = list(chain(*val_group)) + return train_group, val_group + + +def one_batch_train(model: torch.nn.Module, + data_loader: DataLoader, + optimizer: Optimizer, + criterion: _Loss) -> List[float]: + epoch_losses = [] + for data_input, labels in data_loader: + optimizer.zero_grad() + + data_input = data_input.to(device) + labels = labels.float().to(device) + predictions = model(data_input) + loss = criterion(predictions, labels) + loss.backward() + optimizer.step() + epoch_losses.append(float(loss)) + return epoch_losses + + +def one_batch_val(model: torch.nn.Module, + data_loader: DataLoader, + criterion: _Loss) -> Tuple[List[float], torch.Tensor, torch.Tensor]: + epoch_losses = [] + predictions_all = [] + labels_all = [] + with torch.no_grad(): + for data_input, labels in data_loader: + data_input = data_input.to(device) + labels = labels.float().to(device) + predictions = model(data_input) + loss = criterion(predictions, labels) + 
epoch_losses.append(float(loss)) + predictions_all.append(predictions.cpu()) + labels_all.append(labels.cpu()) + return epoch_losses, torch.cat(predictions_all, dim=0), torch.cat(labels_all, dim=0) + + +def train_model(model: torch.nn.Module, + criterion: _Loss, + optimizer: Optimizer, + dataloaders: DataLoader, + epoch_start: int = 0, + epoch_end: int = 15) -> None: + res = [] + for epoch in range(epoch_start, epoch_end): + epoch_losses_train = one_batch_train(model, dataloaders["train"], optimizer, criterion) + epoch_losses_val, predictions_all, labels_all = one_batch_val(model, dataloaders["val"], criterion) + + roc_bold = roc_auc_score(y_score=predictions_all[:, 0], y_true=labels_all[:, 0]) + roc_other = roc_auc_score(y_score=predictions_all[:, 1], y_true=labels_all[:, 1]) + epoch_losses_train = mean(epoch_losses_train) + epoch_losses_val = mean(epoch_losses_val) + res.append((epoch, epoch_losses_train, epoch_losses_val, roc_bold, roc_other)) + report_template = "{:011d} epoch={:06d} train {:01.4f} val {:01.4f} bold {:01.4f} other {:01.4f}" + print(report_template.format(int(time.time()), *res[-1])) + return + + +def main() -> None: + train_group, val_group = get_data(path) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + dataset_val = FontTypeDataset(path, val_group) + dataset_train = FontTypeDataset(path, train_group) + + dataloaders = { + "val": DataLoader(dataset_val, batch_size=16, drop_last=True), + "train": DataLoader(dataset_train, batch_size=16, shuffle=True, drop_last=True) + } + print("GET DATA") + + font_classifier = get_model() + print("GET MODEL") + + font_classifier.requires_grad_(False) + font_classifier.fc.requires_grad_(True) + font_classifier = font_classifier.to(device) + optimizer = Adam(params=font_classifier.fc.parameters(), lr=1e-5) + train_model(model=font_classifier, + criterion=BCELoss(), + dataloaders=dataloaders, + optimizer=optimizer, + epoch_start=0, + epoch_end=15) + + font_classifier.requires_grad_(True) + optimizer = Adam(params=font_classifier.parameters(), lr=1e-4) + train_model(model=font_classifier, + dataloaders=dataloaders, + criterion=BCELoss(), + optimizer=optimizer, + epoch_start=15, + epoch_end=35) + with open(path_out, "wb") as file_out: + font_classifier = font_classifier.cpu() + torch.save(obj=font_classifier, f=file_out) + + +if __name__ == "__main__": + main() diff --git a/dedoc/structure_constructors/concrete_structure_constructors/list_item.py b/dedoc/structure_constructors/concrete_structure_constructors/list_item.py index 362c1cc8..99f11061 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/list_item.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/list_item.py @@ -1,14 +1,14 @@ from typing import List -LIST_ITEM_POINT_END_TYPE = '.' -LIST_ITEM_BRACKET_END_TYPE = ')' +LIST_ITEM_POINT_END_TYPE = "."
+LIST_ITEM_BRACKET_END_TYPE = ")" class ListItem: def __init__(self, item: List[int], end: str) -> None: self.item = item - self.end_type = LIST_ITEM_BRACKET_END_TYPE if end == ')' else LIST_ITEM_POINT_END_TYPE + self.end_type = LIST_ITEM_BRACKET_END_TYPE if end == ")" else LIST_ITEM_POINT_END_TYPE def get_parent(self) -> "ListItem": parent_item = [item for item in self.item] diff --git a/dedoc/structure_constructors/concrete_structure_constructors/list_patcher.py b/dedoc/structure_constructors/concrete_structure_constructors/list_patcher.py deleted file mode 100644 index ea057e52..00000000 --- a/dedoc/structure_constructors/concrete_structure_constructors/list_patcher.py +++ /dev/null @@ -1,58 +0,0 @@ -import re -from typing import List - -from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.structure_constructors.concrete_structure_constructors.list_item import ListItem -from dedoc.data_structures.hierarchy_level import HierarchyLevel - - -class ListPatcher: - def __init__(self) -> None: - self.list_line_regexp = re.compile(r' *\d+(?:\.\d+|)*[ .)].*') - self.list_item_regexp = re.compile(r' *\d+(?:\.\d+|)*[ .)]') - - def __is_list(self, line: str) -> bool: - return self.list_line_regexp.fullmatch(line) is not None - - def __get_list_item(self, line: str) -> ListItem: - list_item = self.list_item_regexp.search(line).group(0).lstrip() - items = [int(item) for item in list_item[:-1].split(r'.') if item] - return ListItem(items, list_item[-1:]) - - def __update_line_levels(self, lines: List[LineWithMeta], list_item_line: LineWithMeta) -> None: - for line in lines: - level_1 = list_item_line.metadata.hierarchy_level.level_1 - level_2 = list_item_line.metadata.hierarchy_level.level_2 - can_be_multiline = line.metadata.hierarchy_level.can_be_multiline - paragraph_type = line.metadata.hierarchy_level.line_type - if level_1 is not None: - level = HierarchyLevel(level_1, 1 if level_2 is None else level_2 + 1, can_be_multiline, paragraph_type) - else: - level = HierarchyLevel.create_raw_text() - line.metadata.hierarchy_level = level - - def patch(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: - items = [] - levels = [] - patched_lines = [] - content = [] - - for line in lines: - if not self.__is_list(line.line): - content.append(line) - continue - - item = self.__get_list_item(line.line) - - if not item.is_first_item() and items: - self.__update_line_levels(content, line) - - patched_lines.extend(content) - content = [] - - items.append(item) - levels.append(line.metadata.hierarchy_level) - patched_lines.append(line) - - patched_lines.extend(content) - return patched_lines diff --git a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py index 04b63fc0..ed1f3277 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py @@ -1,14 +1,14 @@ -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from dedoc.data_structures.document_content import DocumentContent from dedoc.data_structures.document_metadata import DocumentMetadata -from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from 
dedoc.data_structures.parsed_document import ParsedDocument from dedoc.data_structures.tree_node import TreeNode from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.structure_constructors.abstract_structure_constructor import AbstractStructureConstructor -from dedoc.data_structures.hierarchy_level import HierarchyLevel class TreeConstructor(AbstractStructureConstructor): @@ -44,9 +44,9 @@ def structure_document(self, document: UnstructuredDocument, structure_type: Opt for line in not_document_name: # add raw text line # multiline header - if (line.metadata.hierarchy_level.can_be_multiline and - line.metadata.hierarchy_level == tree.metadata.hierarchy_level and - line.metadata.hierarchy_level.line_type == tree.metadata.hierarchy_level.line_type): + hl_equal = line.metadata.hierarchy_level == tree.metadata.hierarchy_level + line_type_equal = line.metadata.hierarchy_level.line_type == tree.metadata.hierarchy_level.line_type + if line.metadata.hierarchy_level.can_be_multiline and hl_equal and line_type_equal: tree.add_text(line) # move up and add child diff --git a/dedoc/structure_constructors/structure_constructor_composition.py b/dedoc/structure_constructors/structure_constructor_composition.py index 899070f5..353b04ae 100644 --- a/dedoc/structure_constructors/structure_constructor_composition.py +++ b/dedoc/structure_constructors/structure_constructor_composition.py @@ -1,6 +1,6 @@ -from typing import Optional, Dict +from typing import Dict, Optional -from dedoc.common.exceptions.structure_extractor_exception import StructureExtractorException +from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError from dedoc.data_structures.parsed_document import ParsedDocument from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.structure_constructors.abstract_structure_constructor import AbstractStructureConstructor @@ -22,10 +22,7 @@ def __init__(self, constructors: Dict[str, AbstractStructureConstructor], defaul self.default_constructor = default_constructor self.table_patcher = TablePatcher() - def structure_document(self, - document: UnstructuredDocument, - structure_type: Optional[str] = None, - parameters: Optional[dict] = None) -> ParsedDocument: + def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None, parameters: Optional[dict] = None) -> ParsedDocument: """ Construct the result document structure according to the `structure_type` parameter. If `structure_type` is empty string or None the default constructor will be used. 
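A minimal sketch of the dispatch that structure_document implements, assuming constructors maps names such as "tree" and "linear" to AbstractStructureConstructor instances (the variable names here are illustrative, not the class attributes themselves):

    structure_type = parameters.get("structure_type")  # e.g. "tree", "linear", "" or None
    if structure_type in constructors:
        parsed = constructors[structure_type].structure_document(document)
    elif structure_type is None or structure_type == "":
        parsed = default_constructor.structure_document(document)  # fall back to the default
    else:
        raise StructureExtractorError(f"Bad structure type {structure_type}")  # full message as in the hunk below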
@@ -42,4 +39,4 @@ def structure_document(self, if structure_type is None or structure_type == "": return self.default_constructor.structure_document(document) - raise StructureExtractorException(f"Bad structure type {structure_type}, available structure types is: {' '.join(self.constructors.keys())}") + raise StructureExtractorError(f"Bad structure type {structure_type}, available structure types is: {' '.join(self.constructors.keys())}") diff --git a/dedoc/structure_constructors/table_patcher.py b/dedoc/structure_constructors/table_patcher.py index 7e571e05..a321a0dd 100644 --- a/dedoc/structure_constructors/table_patcher.py +++ b/dedoc/structure_constructors/table_patcher.py @@ -1,11 +1,11 @@ from typing import List from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.table import Table from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata class TablePatcher: @@ -61,33 +61,19 @@ def _create_paragraphs_from_table(self, table: Table, hierarchy_level: int) -> L @staticmethod def _create_table_line(table: Table, hierarchy_level: int) -> LineWithMeta: - hierarchy_level_new = HierarchyLevel( - level_1=hierarchy_level + 2, # table hierarchy is lower than raw text - level_2=0, - can_be_multiline=False, - line_type="table" - ) + # table hierarchy is lower than raw text + hierarchy_level_new = HierarchyLevel(level_1=hierarchy_level + 2, level_2=0, can_be_multiline=False, line_type="table") metadata = LineMetadata(hierarchy_level=hierarchy_level_new, page_id=table.metadata.page_id, line_id=None) - return LineWithMeta(line="", metadata=metadata, annotations=[], uid="table_{}".format(table.metadata.uid)) + return LineWithMeta(line="", metadata=metadata, annotations=[], uid=f"table_{table.metadata.uid}") @staticmethod def _create_row_line(table: Table, hierarchy_level: int) -> LineWithMeta: - hierarchy_level_new = HierarchyLevel( - level_1=hierarchy_level + 3, - level_2=0, - can_be_multiline=False, - line_type="table_row" - ) + hierarchy_level_new = HierarchyLevel(level_1=hierarchy_level + 3, level_2=0, can_be_multiline=False, line_type="table_row") metadata = LineMetadata(hierarchy_level=hierarchy_level_new, page_id=table.metadata.page_id, line_id=None) return LineWithMeta(line="", metadata=metadata, annotations=[]) @staticmethod def _create_cell_line(table: Table, hierarchy_level: int, cell: str) -> LineWithMeta: - hierarchy_level_new = HierarchyLevel( - level_1=hierarchy_level + 4, - level_2=0, - can_be_multiline=False, - line_type="table_cell" - ) + hierarchy_level_new = HierarchyLevel(level_1=hierarchy_level + 4, level_2=0, can_be_multiline=False, line_type="table_cell") metadata = LineMetadata(hierarchy_level=hierarchy_level_new, page_id=table.metadata.page_id, line_id=None) return LineWithMeta(line=cell, metadata=metadata, annotations=[]) diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py index 40c8078d..0e4eba00 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py +++ 
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
index 40c8078d..0e4eba00 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
@@ -104,9 +104,8 @@ def _postprocess_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
     def __call_builder(self, start_tag: str, lines_with_labels: List[Tuple[LineWithMeta, str]]) -> List[LineWithMeta]:
         for builder in self.hierarchy_level_builders:
             if builder.can_build(start_tag, self.hl_type):
-                return builder.get_lines_with_hierarchy(lines_with_labels=lines_with_labels,
-                                                        init_hl_depth=self.init_hl_depth)
-        raise ValueError("No one can handle {} {}".format(start_tag, self.hl_type))
+                return builder.get_lines_with_hierarchy(lines_with_labels=lines_with_labels, init_hl_depth=self.init_hl_depth)
+        raise ValueError(f"No one can handle {start_tag} {self.hl_type}")

     def _fix_labels(self, labels: List[str]) -> List[str]:
         """
@@ -138,10 +137,14 @@ def _fix_labels(self, labels: List[str]) -> List[str]:
         if last_body_unit is None:
             last_body_unit = title_end

-        assert title_end <= application_start, "{} <= {}".format(title_end, application_start)
-        assert title_end <= last_body_unit, "{} <= {}".format(title_end, last_body_unit)
-        assert last_body_unit <= application_start, "{} <= {}".format(last_body_unit, application_start)
+        assert title_end <= application_start, f"{title_end} <= {application_start}"
+        assert title_end <= last_body_unit, f"{title_end} <= {last_body_unit}"
+        assert last_body_unit <= application_start, f"{last_body_unit} <= {application_start}"

+        result = self.__get_result(application_start, labels, last_body_unit, title_end)
+        return result
+
+    def __get_result(self, application_start: int, labels: List[str], last_body_unit: int, title_end: int) -> List[str]:
         result = []
         for index, label in enumerate(labels):
             if label == "footer":
@@ -173,15 +176,13 @@ def _postprocess_roman(self, hierarchy_level: HierarchyLevel, line: LineWithMeta
         match = LawTextFeatures.roman_regexp.match(line.line)
         prefix = line.line[match.start(): match.end()]
         suffix = line.line[match.end():]
-        symbols = [('T', 'I'), ('Т', 'I'), ('У', 'V'), ('П', "II"), ('Ш', "III"), ('Г', 'I')]
+        symbols = [("T", "I"), ("Т", "I"), ("У", "V"), ("П", "II"), ("Ш", "III"), ("Г", "I")]
         for symbol_from, symbol_to in symbols:
             prefix = prefix.replace(symbol_from, symbol_to)
         line.set_line(prefix + suffix)
         return line

-    def __finish_chunk(self,
-                       is_application_begun: bool,
-                       lines_with_labels: List[Tuple[LineWithMeta, str]]) -> List[LineWithMeta]:
+    def __finish_chunk(self, is_application_begun: bool, lines_with_labels: List[Tuple[LineWithMeta, str]]) -> List[LineWithMeta]:
         if len(lines_with_labels) == 0:
             return []
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py
index faadb7e5..cf2e97ad 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/classifying_law_structure_extractor.py
@@ -3,7 +3,7 @@
 from abc import ABC
 from collections import OrderedDict
 from enum import Enum
-from typing import List, Dict, Iterable, Optional
+from typing import Dict, Iterable, List, Optional

 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
@@ -13,18 +13,18 @@


 class LawDocType(Enum):
-    decree = 'постановление'
-    order = 'приказ'
-    bylaw = 'распоряжение'
-    definition = 'определение'
-    directive = 'директива'
-    code = 'кодекс'
-    law = 'закон'
-    constitution = 'конституция'
-    edict = 'указ'
-    state = 'положение'
-    instruction = 'инструкция'
-    federalLaw = 'федеральный закон'
+    decree = "постановление"
+    order = "приказ"
+    bylaw = "распоряжение"
+    definition = "определение"
+    directive = "директива"
+    code = "кодекс"
+    law = "закон"
+    constitution = "конституция"
+    edict = "указ"
+    state = "положение"
+    instruction = "инструкция"
+    federal_law = "федеральный закон"

     @staticmethod
     def doc_types() -> List[str]:
@@ -33,7 +33,7 @@ def doc_types() -> List[str]:
         return [LawDocType.definition,
                 LawDocType.order,
                 LawDocType.bylaw,
                 LawDocType.code,
-                LawDocType.federalLaw,
+                LawDocType.federal_law,
                 LawDocType.edict,
                 LawDocType.law,
                 LawDocType.decree,
@@ -43,7 +43,7 @@
                 LawDocType.instruction]

     @staticmethod
-    def foiv_types() -> List['LawDocType']:
+    def foiv_types() -> List["LawDocType"]:
         return [LawDocType.order,
                 LawDocType.state,
                 LawDocType.instruction]
@@ -67,42 +67,42 @@ def __init__(self, extractors: Dict[str, AbstractStructureExtractor], *, config:
         self.main_templates = dict()

         federal_law_ws = self.__add_whitespace_match("федеральный закон")
-        self.main_templates[LawDocType.federalLaw] = {r"\b{}\b".format(federal_law_ws)}
+        self.main_templates[LawDocType.federal_law] = {rf"\b{federal_law_ws}\b"}

         decree_ws = self.__add_whitespace_match("постановление")
-        self.main_templates[LawDocType.decree] = {r"\b{}\b".format(decree_ws)}
+        self.main_templates[LawDocType.decree] = {rf"\b{decree_ws}\b"}

         # Hot fix for tesseract common error
         order_char_map = {"з": "[з3]"}
         order_ws = self.__add_whitespace_match("приказ", char_map=order_char_map)
-        self.main_templates[LawDocType.order] = {r"\b{}\b".format(order_ws)}
+        self.main_templates[LawDocType.order] = {rf"\b{order_ws}\b"}

         bylaw_ws = self.__add_whitespace_match("распоряжение")
-        self.main_templates[LawDocType.bylaw] = {r"\b{}\b".format(bylaw_ws)}
+        self.main_templates[LawDocType.bylaw] = {rf"\b{bylaw_ws}\b"}

         law_ws = self.__add_whitespace_match("закон")
-        self.main_templates[LawDocType.law] = {r"\b{}\b".format(law_ws)}
+        self.main_templates[LawDocType.law] = {rf"\b{law_ws}\b"}

         edict_ws = self.__add_whitespace_match("указ")
-        self.main_templates[LawDocType.edict] = {r"\b{}\b".format(edict_ws)}
+        self.main_templates[LawDocType.edict] = {rf"\b{edict_ws}\b"}

         definition_ws = self.__add_whitespace_match("определение")
-        self.main_templates[LawDocType.definition] = {r"\b{}\b".format(definition_ws)}
+        self.main_templates[LawDocType.definition] = {rf"\b{definition_ws}\b"}

         directive_ws = self.__add_whitespace_match("директива")
-        self.main_templates[LawDocType.directive] = {r"\b{}\b".format(directive_ws)}  # TODO no data
+        self.main_templates[LawDocType.directive] = {rf"\b{directive_ws}\b"}  # TODO no data

         code_ws = self.__add_whitespace_match("кодекс")
-        self.main_templates[LawDocType.code] = {r"\b{}\b".format(code_ws)}
+        self.main_templates[LawDocType.code] = {rf"\b{code_ws}\b"}

         constitution_ws = self.__add_whitespace_match("конституция")
-        self.main_templates[LawDocType.constitution] = {r"\b{}\b".format(constitution_ws)}
+        self.main_templates[LawDocType.constitution] = {rf"\b{constitution_ws}\b"}

         state_ws = self.__add_whitespace_match("положение")
-        self.main_templates[LawDocType.state] = {r"\b{}\b".format(state_ws)}
+        self.main_templates[LawDocType.state] = {rf"\b{state_ws}\b"}

         instruction_ws = self.__add_whitespace_match("инструкция")
-        self.main_templates[LawDocType.instruction] = {r"\b{}\b".format(instruction_ws)}
+        self.main_templates[LawDocType.instruction] = {rf"\b{instruction_ws}\b"}
{rf"\b{instruction_ws}\b"} def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> UnstructuredDocument: """ @@ -112,7 +112,7 @@ def extract_structure(self, document: UnstructuredDocument, parameters: dict) -> """ selected_extractor = self._predict_extractor(lines=document.lines) result = selected_extractor.extract_structure(document, parameters) - warning = "Use {} classifier".format(selected_extractor.document_type) + warning = f"Use {selected_extractor.document_type} classifier" result.warnings = result.warnings + [warning] return result @@ -138,7 +138,7 @@ def __type_detect(self, lines: List[str]) -> Optional[LawDocType]: for line in batch: # - for ЯМАЛО-НЕНЕЦКИЙ, \.№ for ПОСТАНОВЛЕНИЕ от 1.1.2000 № 34 # / for Приказ № 47/823 от 17.12.2013 г. - if re.fullmatch(r'[\s\w-]*' + template + r'[()/\.№\s\w-]*', line, re.IGNORECASE): + if re.fullmatch(r"[\s\w-]*" + template + r"[()/\.№\s\w-]*", line, re.IGNORECASE): if doc_type is LawDocType.law: law_matched = True else: @@ -150,20 +150,18 @@ def __type_detect(self, lines: List[str]) -> Optional[LawDocType]: def __get_extractor_by_type(self, doc_type: Optional[LawDocType]) -> AbstractStructureExtractor: if doc_type is None: - self.logger.info("Dynamic document type not found, using base: {}".format( - LawStructureExtractor.document_type)) + self.logger.info(f"Dynamic document type not found, using base: {LawStructureExtractor.document_type}") return self.extractors[LawStructureExtractor.document_type] elif doc_type in LawDocType.foiv_types(): if FoivLawStructureExtractor.document_type in self.extractors: - self.logger.info("Dynamic document type predicted: {}".format( - FoivLawStructureExtractor.document_type)) + self.logger.info(f"Dynamic document type predicted: {FoivLawStructureExtractor.document_type}") return self.extractors[FoivLawStructureExtractor.document_type] else: - self.logger.warning("No classifier for predicted dynamic document type {}, using {}".format( - FoivLawStructureExtractor.document_type, LawStructureExtractor.document_type)) + self.logger.warning(f"No classifier for predicted dynamic document type {FoivLawStructureExtractor.document_type}, " + f"using {LawStructureExtractor.document_type}") return self.extractors[LawStructureExtractor.document_type] else: - self.logger.info("Dynamic document type predicted: {}".format(LawStructureExtractor.document_type)) + self.logger.info(f"Dynamic document type predicted: {LawStructureExtractor.document_type}") return self.extractors[LawStructureExtractor.document_type] def __add_whitespace_match(self, pattern: Iterable, char_map: dict = None) -> str: @@ -199,8 +197,7 @@ def __create_line_batches(self, lines: List[str], batch_size: int, batch_count: return batch_lines def __text_clean(self, text: str) -> str: - bad_characters = OrderedDict({"\u0438\u0306": "й", "\u0439\u0306": "й", - "\u0418\u0306": "Й", "\u0419\u0306": "Й"}) + bad_characters = OrderedDict({"\u0438\u0306": "й", "\u0439\u0306": "й", "\u0418\u0306": "Й", "\u0419\u0306": "Й"}) for bad_c, good_c in bad_characters.items(): text = text.replace(bad_c, good_c) return text diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index dee5c2cc..4ac9b075 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -1,4 +1,4 @@ -from 
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
index dee5c2cc..4ac9b075 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import List, Optional

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -72,8 +72,8 @@ def get_list_hl_with_regexp(line: LineWithMeta, previous_line: Optional[LineWith
         if prefix.name == BracketPrefix.name:
             # list like 1)
             # check if tesseract recognize russian б as 6 (bi as six)
-            if (prefix.prefix_num == 6 and previous_line is not None and
-                    previous_line.line.lower().strip().startswith(("a)", "а)"))):  # here is russian and english letters
+            if prefix.prefix_num == 6 and previous_line is not None and \
+                    previous_line.line.lower().strip().startswith(("a)", "а)")):  # here is russian and english letters
                 return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item)
             return HierarchyLevel(3, 1, False, line_type=HierarchyLevel.list_item)
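A toy restatement of the tesseract guard in get_list_hl_with_regexp above (is_misrecognized_b is a hypothetical helper name, not dedoc API):

def is_misrecognized_b(prefix_num: int, previous_line: str) -> bool:
    # "6)" right after a line starting with "a)"/"а)" is almost certainly an OCR'd "б)"
    return prefix_num == 6 and previous_line.lower().strip().startswith(("a)", "а)"))

print(is_misrecognized_b(6, "а) первый пункт"))  # True -> deeper list level
print(is_misrecognized_b(6, "5) пятый пункт"))   # False -> a real sixth item
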
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
index f522b385..09c2c9eb 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
@@ -6,7 +6,6 @@
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
-
 from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
 from dedoc.structure_extractors.hierarchy_level_builders.diploma_builder.body_builder import DiplomaBodyBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py
index e45f01df..e5008a50 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/foiv_law_structure_extractor.py
@@ -5,8 +5,7 @@
 from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.application_builder.application_foiv_hierarchy_level_builder import \
     ApplicationFoivHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_foiv_hierarchy_level_builder import \
-    BodyFoivHierarchyLevelBuilder
+from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_foiv_hierarchy_level_builder import BodyFoivHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.cellar_builder import CellarHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_foiv_item
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py
index afc73acd..b46f8c7d 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/law_structure_excractor.py
@@ -2,16 +2,13 @@
 from typing import List

 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import \
-    AbstractLawStructureExtractor
-from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import \
-    HeaderHierarchyLevelBuilder
+from dedoc.structure_extractors.concrete_structure_extractors.abstract_law_structure_extractor import AbstractLawStructureExtractor
+from dedoc.structure_extractors.hierarchy_level_builders.header_builder.header_hierarchy_level_builder import HeaderHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.application_builder.application_law_hierarchy_level_builder import \
     ApplicationLawHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import \
-    BodyLawHierarchyLevelBuilder
+from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import BodyLawHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.cellar_builder import CellarHierarchyLevelBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_number, regexps_ends_of_number
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_number


 class LawStructureExtractor(AbstractLawStructureExtractor):
@@ -28,9 +25,9 @@ def __init__(self, *, config: dict) -> None:
                                          BodyLawHierarchyLevelBuilder(),
                                          CellarHierarchyLevelBuilder(),
                                          ApplicationLawHierarchyLevelBuilder()]
-        self.regexps_item = re.compile(r'^\s*(\d*\.)*\d+[\)|\}]')
+        self.regexps_item = re.compile(r"^\s*(\d*\.)*\d+[\)|\}]")
         self.regexps_part = regexps_number
-        self.regexps_subitem = re.compile(r'^\s*[а-яё]\)')
+        self.regexps_subitem = re.compile(r"^\s*[а-яё]\)")
         self.regexps_ends_of_number = regexps_ends_of_number
         self.init_hl_depth = 2
         self.hl_type = "law"
diff --git a/dedoc/structure_extractors/feature_extractors/abstract_extractor.py b/dedoc/structure_extractors/feature_extractors/abstract_extractor.py
index 0ddd3bd2..161ed9d9 100644
--- a/dedoc/structure_extractors/feature_extractors/abstract_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/abstract_extractor.py
@@ -1,6 +1,6 @@
 import json
 from abc import ABC, abstractmethod
-from typing import List, Pattern, Iterable, Optional, Tuple
+from typing import Iterable, List, Optional, Pattern, Tuple

 import numpy as np
 import pandas as pd
@@ -14,8 +14,8 @@
 from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
 from dedoc.data_structures.concrete_annotations.underlined_annotation import UnderlinedAnnotation
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import \
-    regexps_number, regexps_ends_of_number, regexps_subitem_extended, regexps_subitem, regexps_year
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_number, regexps_subitem, regexps_subitem_extended, \
+    regexps_year
 from dedoc.utils.utils import list_get
@@ -60,33 +60,30 @@ def _next_line_features(feature_matrix: np.ndarray, n: int) -> np.ndarray:

     @staticmethod
     def _create_features_name(old_names: pd.Index, which: str, num: int) -> List[str]:
-        return ["{}_{}_{}".format(old_name, which, num) for old_name in old_names]
+        return [f"{old_name}_{which}_{num}" for old_name in old_names]

     def prev_next_line_features(self, matrix: pd.DataFrame, n_prev: int, n_next: int) -> pd.DataFrame:
         """
        add previous and next features with their names
         """
         feature_names = matrix.columns
-        prev_line_features = [pd.DataFrame(data=self._prev_line_features(matrix.values, i),
-                                           columns=self._create_features_name(feature_names, "prev", i))
+        prev_line_features = [pd.DataFrame(data=self._prev_line_features(matrix.values, i), columns=self._create_features_name(feature_names, "prev", i))
                               for i in range(1, n_prev + 1)]
-        next_line_features = [pd.DataFrame(data=self._next_line_features(matrix.values, i),
-                                           columns=self._create_features_name(feature_names, "next", i))
+        next_line_features = [pd.DataFrame(data=self._next_line_features(matrix.values, i), columns=self._create_features_name(feature_names, "next", i))
                               for i in range(1, n_next + 1)]
         matrices = [matrix] + prev_line_features + next_line_features
         result_matrix = pd.concat(matrices, axis=1)
         return result_matrix

-    def _start_regexp(self, line: str, regexps: List[Pattern],
-                      suffix: Optional[str] = None) -> Iterable[Tuple[str, float]]:
+    def _start_regexp(self, line: str, regexps: List[Pattern], suffix: Optional[str] = None) -> Iterable[Tuple[str, float]]:
         matches = 0
         text = line.strip()
         for i, pattern in enumerate(regexps):  # list patterns
             if suffix is None:
-                feature_name = "start_regexp_{}".format(i)
+                feature_name = f"start_regexp_{i}"
             else:
-                feature_name = "start_regexp_{}_{}".format(i, suffix)
+                feature_name = f"start_regexp_{i}_{suffix}"
             match = pattern.match(text)
             if match is not None and match.end() > 0:
                 matches += 1
@@ -96,7 +93,7 @@ def _start_regexp(self, line: str, regexps: List[Pattern],
         if suffix is None:
             yield "start_regexp_num_matches", matches
         else:
-            yield "start_regexp_num_matches_{}".format(suffix), matches
+            yield f"start_regexp_num_matches_{suffix}", matches

     @staticmethod
     def _get_size(line: LineWithMeta) -> float:
@@ -105,8 +102,7 @@ def _get_size(line: LineWithMeta) -> float:

     @staticmethod
     def _get_bold(line: LineWithMeta) -> float:
-        bold = [annotation for annotation in line.annotations
-                if annotation.name == BoldAnnotation.name and annotation.value == "True"]
+        bold = [annotation for annotation in line.annotations if annotation.name == BoldAnnotation.name and annotation.value == "True"]
         return 1. if len(bold) > 0 else 0

     @staticmethod
@@ -186,7 +182,7 @@ def _can_be_prev_element(this_item: Optional[str], prev_item: Optional[str]) ->
             return prev_item_list == this_item_prefix and this_item_list[-1] == "1"
         if len(prev_item_list) == len(this_item_list):
             return prev_item_prefix == this_item_prefix and int(this_item_list[-1]) - int(prev_item_list[-1]) == 1
-        raise Exception("Unexpected case where this_item = {} prev_item = {}".format(this_item, prev_item))
+        raise Exception(f"Unexpected case where this_item = {this_item} prev_item = {prev_item}")

     def _before_special_line(self, document: List[LineWithMeta], find_special_line: method) -> List[float]:
         """
@@ -202,7 +198,7 @@ def _before_special_line(self, document: List[LineWithMeta], find_special_line:
             result.extend([0. for _ in document])
         else:
             special_line_id = special_line_position[-1]
-            for line_id, line in enumerate(document):
+            for line_id in range(len(document)):
                 result.append(line_id - special_line_id)
         return result
@@ -216,7 +212,7 @@ def _list_features(self, lines: List[LineWithMeta]) -> List[float]:
             searched = self.ends_of_number.search(number)
             if searched is not None:
                 numbers[num] = (line_id, number[:searched.start()])
-            if number.endswith((')', '}', '.')):
+            if number.endswith((")", "}", ".")):
                 numbers[num] = (line_id, number[:-1])

         if len(numbers) == 0:
@@ -257,6 +253,5 @@ def _normalize_features(feature_column: pd.Series) -> pd.Series:
         Output: normalized feature vector [-1; 1]
         """
         feature_mean, feature_min, feature_max = feature_column.mean(), feature_column.min(), feature_column.max()
-        new_feature_column = (feature_column - feature_mean) / (feature_max - feature_min) if \
-            feature_max - feature_min != 0.0 else 0.0
+        new_feature_column = (feature_column - feature_mean) / (feature_max - feature_min) if feature_max - feature_min != 0.0 else 0.0
         return new_feature_column
diff --git a/dedoc/structure_extractors/feature_extractors/diploma_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/diploma_feature_extractor.py
index 6d02e32d..504b0934 100644
--- a/dedoc/structure_extractors/feature_extractors/diploma_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/diploma_feature_extractor.py
@@ -1,6 +1,6 @@
 import re
 from collections import defaultdict
-from typing import List, Tuple, Optional, Iterator
+from typing import Iterator, List, Optional, Tuple

 import pandas as pd
 from Levenshtein import ratio
@@ -12,7 +12,7 @@
 from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
 from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor
 from dedoc.structure_extractors.feature_extractors.utils_feature_extractor import normalization_by_min_max
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_item, regexps_digits_with_dots
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots, regexps_item


 class DiplomaFeatureExtractor(AbstractFeatureExtractor):
@@ -53,8 +53,7 @@ def transform(self,
                   toc_lines: Optional[List[List[LineWithMeta]]] = None) -> pd.DataFrame:
         toc_lines = [[] for _ in documents] if toc_lines is None else toc_lines
         assert len(toc_lines) == len(documents)
-        result_matrix = pd.concat([self.__process_document(document, d_toc_lines) for document, d_toc_lines in zip(documents, toc_lines)],
-                                  ignore_index=True)
+        result_matrix = pd.concat([self.__process_document(document, d_toc_lines) for document, d_toc_lines in zip(documents, toc_lines)], ignore_index=True)
         features = sorted(result_matrix.columns)
         return result_matrix[features].astype(float)

@@ -64,7 +63,7 @@ def __process_document(self, lines: List[LineWithMeta], toc_lines: Optional[List
         features_df["list_item"] = self._list_features(lines)

         one_line_features_dict = defaultdict(list)
-        for line_id, line in enumerate(lines):
+        for line in lines:
             for item in self._one_line_features(line, len(lines), toc_lines):
                 feature_name, feature = item[0], item[1]
                 one_line_features_dict[feature_name].append(feature)
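A quick numeric check of _normalize_features above, using plain pandas and made-up values (no dedoc imports): values are centered by the mean and scaled by the range, so the result always stays within [-1, 1], and constant columns fall back to 0.0.

import pandas as pd

feature_column = pd.Series([1.0, 2.0, 3.0, 4.0])
mean = feature_column.mean()                            # 2.5
spread = feature_column.max() - feature_column.min()    # 3.0
print(((feature_column - mean) / spread).tolist())      # [-0.5, -0.167, 0.167, 0.5] (rounded)
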
diff --git a/dedoc/structure_extractors/feature_extractors/first_word_features.py b/dedoc/structure_extractors/feature_extractors/first_word_features.py
index 42743a25..73c8b241 100644
--- a/dedoc/structure_extractors/feature_extractors/first_word_features.py
+++ b/dedoc/structure_extractors/feature_extractors/first_word_features.py
@@ -1,4 +1,4 @@
-from typing import List, Iterable, Optional
+from typing import Iterable, List, Optional

 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
diff --git a/dedoc/structure_extractors/feature_extractors/law_text_features.py b/dedoc/structure_extractors/feature_extractors/law_text_features.py
index f3c5331b..2acbb3ae 100644
--- a/dedoc/structure_extractors/feature_extractors/law_text_features.py
+++ b/dedoc/structure_extractors/feature_extractors/law_text_features.py
@@ -1,13 +1,12 @@
 import re
 from collections import defaultdict
-from typing import List, Iterator, Optional, Tuple, Dict
+from typing import Dict, Iterator, List, Optional, Tuple

 import pandas as pd

 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
-from dedoc.structure_extractors.feature_extractors.list_features.list_features_extractor import \
-    ListFeaturesExtractor
+from dedoc.structure_extractors.feature_extractors.list_features.list_features_extractor import ListFeaturesExtractor
 from dedoc.structure_extractors.feature_extractors.utils_feature_extractor import normalization_by_min_max
@@ -25,12 +24,12 @@ class LawTextFeatures(AbstractFeatureExtractor):
         r"((к распоряжению)|(к постановлению)|(к приказу))?\s*$"
     )
     regexps_items = [
-        re.compile(r'^\s*(\d{1,3}\.)+\s*[a-zA-Zа-яА-ЯёЁ]'),
-        re.compile(r'^\s*\d{1,3}(\)|\})'),
+        re.compile(r"^\s*(\d{1,3}\.)+\s*[a-zA-Zа-яА-ЯёЁ]"),
+        re.compile(r"^\s*\d{1,3}(\)|\})"),
     ]  # 12
     regexps_subitem = [
-        re.compile(r'^\s*[а-яё]\)'),
+        re.compile(r"^\s*[а-яё]\)"),
     ]
     quote_start = re.compile(r"^([\"'«])")
     quote_end = re.compile(r".*[\"'»][.;]?$")
@@ -50,8 +49,7 @@ def fit(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None

     def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> pd.DataFrame:
         assert len(documents) > 0
-        result_matrix = pd.concat([self.__process_document(document) for document in documents],
-                                  ignore_index=True)
+        result_matrix = pd.concat([self.__process_document(document) for document in documents], ignore_index=True)
         list_features = self.list_feature_extractor.transform(documents)
         result_matrix = pd.concat([result_matrix, list_features], axis=1)
         features = sorted(result_matrix.columns)
@@ -71,11 +69,8 @@ def __process_document(self, lines: List[LineWithMeta], with_prev_next: bool = T

         one_line_features_dict = defaultdict(list)

-        for line_id, line in enumerate(lines):
-            for item in self._one_line_features(line,
-                                                total_lines=len(lines),
-                                                start_page=start_page,
-                                                finish_page=finish_page):
+        for line in lines:
+            for item in self._one_line_features(line, total_lines=len(lines), start_page=start_page, finish_page=finish_page):
                 feature_name, feature = item[0], item[1]
                 one_line_features_dict[feature_name].append(feature)
         one_line_features_df = pd.DataFrame(one_line_features_dict)
@@ -88,8 +83,8 @@ def __process_document(self, lines: List[LineWithMeta], with_prev_next: bool = T
         one_line_features_df = self.prev_next_line_features(one_line_features_df, 3, 3)
         result_matrix = pd.concat([one_line_features_df, features_df], axis=1)

-        '''for feature in result_matrix.keys():
-            result_matrix[feature] = self._normalize_features(result_matrix[feature])'''
+        """for feature in result_matrix.keys():
self._normalize_features(result_matrix[feature])""" return result_matrix def _look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[str, List]: @@ -110,7 +105,7 @@ def _look_at_prev_line(self, document: List[LineWithMeta], n: int = 1) -> Dict[s if line_id >= n: prev_line = document[line_id - n] - is_prev_line_ends = prev_line.line.endswith(('.', ';')) + is_prev_line_ends = prev_line.line.endswith((".", ";")) res["prev_line_ends"].append(1 if is_prev_line_ends else 0) res["prev_ends_with_colon"].append(prev_line.line.endswith(":")) res["prev_starts_with_article"].append(prev_line.line.lower().strip().startswith("статья")) @@ -135,13 +130,13 @@ def _one_line_features(self, line: LineWithMeta, total_lines: int, start_page: i yield "endswith_semicolon", float(line.line.strip().endswith(";")) yield "endswith_colon", float(line.line.strip().endswith(":")) yield "endswith_comma", float(line.line.strip().endswith(",")) - yield "startswith_bracket", float(line.line.strip().startswith(('(', '{'))) + yield "startswith_bracket", float(line.line.strip().startswith(("(", "{"))) bracket_cnt = 0 for char in line.line: - if char == '(': + if char == "(": bracket_cnt += 1 - elif char == ')': + elif char == ")": bracket_cnt = max(0, bracket_cnt - 1) yield "bracket_num", bracket_cnt @@ -172,7 +167,7 @@ def _one_line_features(self, line: LineWithMeta, total_lines: int, start_page: i match = regexp.match(line.line) if match: yield "subitem_regexp_len", len(match.group()) - yield "subitem_regexp_num", ord(match.group().strip()[:-1]) - ord('а') + yield "subitem_regexp_num", ord(match.group().strip()[:-1]) - ord("а") else: yield "subitem_regexp_len", 0 yield "subitem_regexp_num", 0 @@ -214,7 +209,7 @@ def _inside_quotes(self, lines: List[LineWithMeta]) -> List[int]: match = self.quote_start.match(text) if match is not None and self.__any_item_found(text[1:]): # quotation started match = "»" if match.group() == "«" else match.group() - quote_end = re.compile(r".*{}[.;]?$".format(match)) + quote_end = re.compile(rf".*{match}[.;]?$") if quote_end.match(text) is None: quote_started = True new_quote.append(1) diff --git a/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py b/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py index fbfc18b3..8c392ef0 100644 --- a/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py +++ b/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import List, Optional, Dict, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import pandas as pd @@ -8,9 +8,9 @@ from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_prefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix -from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix from 
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py b/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py
index fbfc18b3..8c392ef0 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/list_features_extractor.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import List, Optional, Dict, Tuple
+from typing import Dict, List, Optional, Tuple

 import numpy as np
 import pandas as pd
@@ -8,9 +8,9 @@
 from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
 from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_prefix
 from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
+from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
 from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
 from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix
-from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
 from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix
@@ -26,8 +26,7 @@ class ListFeaturesExtractor(AbstractFeatureExtractor):
     def __init__(self, window_size: int = 25, prefix_list: Optional[List[LinePrefix]] = None) -> None:
         super().__init__()
         self.window_size = window_size
-        self.prefix_list = prefix_list if prefix_list is not None \
-            else [BulletPrefix, LetterPrefix, BracketPrefix, DottedPrefix]
+        self.prefix_list = prefix_list if prefix_list is not None else [BulletPrefix, LetterPrefix, BracketPrefix, DottedPrefix]

     def parameters(self) -> dict:
         return {"window_size": self.window_size}
@@ -39,8 +38,7 @@ def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]]
         features = [self.one_document(doc)[1] for doc in documents]
         return pd.concat(features, axis=0, ignore_index=True)

-    def one_document(self,
-                     doc: List[LineWithMeta]) -> Tuple[List[LinePrefix], pd.DataFrame]:
+    def one_document(self, doc: List[LineWithMeta]) -> Tuple[List[LinePrefix], pd.DataFrame]:
         prefixes = [self._get_prefix(line) for line in doc]
         indents = np.array([prefix.indent for prefix in prefixes])
         res = []
@@ -55,20 +53,14 @@ def one_document(self,
                 features_dict[feature_name].append(feature_value)
         return prefixes, pd.DataFrame(features_dict)

-    def _one_line_features(self,
-                           line: LineWithMeta,
-                           prefix: LinePrefix,
-                           line_id: int,
-                           window: Window) -> Dict[str, float]:
+    def _one_line_features(self, line: LineWithMeta, prefix: LinePrefix, line_id: int, window: Window) -> Dict[str, float]:
         predecessor_num = 0
         predecessor_num_same_indent = 0
         same_indent = 0
         same_prefix = 0
         for prefix_other in window.prefix_before + window.prefix_after:
             is_predecessor = prefix.predecessor(prefix_other) or prefix.successor(prefix_other)
-            is_same_indent = self._same_indent(this_indent=prefix.indent,
-                                               other_indent=prefix_other.indent,
-                                               std=window.indent_std)
+            is_same_indent = self._same_indent(this_indent=prefix.indent, other_indent=prefix_other.indent, std=window.indent_std)
             predecessor_num += is_predecessor
             same_indent += is_same_indent
             predecessor_num_same_indent += (is_same_indent and is_predecessor)
@@ -78,9 +70,9 @@ def _one_line_features(self,
         same_indent /= window_size
         predecessor_num_same_indent /= window_size
         predecessor_num /= window_size
-        return {"same_indent_{}".format(self.window_size): same_indent,
-                "predecessor_num_same_indent_{}".format(self.window_size): predecessor_num_same_indent,
-                "predecessor_num_{}".format(self.window_size): predecessor_num}
+        return {f"same_indent_{self.window_size}": same_indent,
+                f"predecessor_num_same_indent_{self.window_size}": predecessor_num_same_indent,
+                f"predecessor_num_{self.window_size}": predecessor_num}

     def _same_indent(self, this_indent: float, other_indent: float, std: float) -> bool:
         eps = 1
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py
index f58562fb..f7e4e457 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/any_letter_prefix.py
@@ -18,7 +18,7 @@ class AnyLetterPrefix(LinePrefix):
     """

     name = "any_letter"
-    regexp = re.compile(r'^\s*\w\)')
+    regexp = re.compile(r"^\s*\w\)")

     def predecessor(self, other: "LinePrefix") -> bool:
         return isinstance(other, AnyLetterPrefix)
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py
index df0e1040..c9a67d90 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_prefix.py
@@ -11,7 +11,7 @@ class BracketPrefix(LinePrefix):
     2) second element
     """

-    regexp = re.compile(r'^\s*\d\)')
+    regexp = re.compile(r"^\s*\d\)")
     name = "bracket"

     def __init__(self, prefix: str, indent: float) -> None:
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py
index b6ce0b26..d71a6de9 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py
@@ -1,4 +1,5 @@
 import re
+
 import roman

 from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix
@@ -14,7 +15,7 @@ class BracketRomanPrefix(LinePrefix):
     iv) forth item
     """

-    regexp = re.compile(r'^\s*[ivxl]\)')
+    regexp = re.compile(r"^\s*[ivxl]\)")
     name = "roman"

     def __init__(self, prefix: str, indent: float) -> None:
@@ -29,4 +30,4 @@ def is_valid(prefix_str: str) -> bool:
         if len(prefix_str) <= 1 or not prefix_str.endswith(")"):
             return False
         prefix_set = set(prefix_str[:-1])
-        return prefix_set.issubset(set('ivxl'))
+        return prefix_set.issubset(set("ivxl"))
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/dotted_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/dotted_prefix.py
index 7f7723d7..f81991c7 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/dotted_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/dotted_prefix.py
@@ -5,7 +5,7 @@


 class DottedPrefix(LinePrefix):
-    regexp = re.compile(r'^\s*(\d+\.)+(\d+)?\s*')
+    regexp = re.compile(r"^\s*(\d+\.)+(\d+)?\s*")
     name = "dotted"

     def __init__(self, prefix: str, indent: float) -> None:
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py
index 8558c5a2..6ad602ab 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/letter_prefix.py
@@ -16,16 +16,16 @@ class LetterPrefix(LinePrefix):
     в) Kalt wie das Eis
     """

-    regexp = re.compile(r'^\s*[а-яёa-z]\)')
+    regexp = re.compile(r"^\s*[а-яёa-z]\)")
     name = "letter"

     @property
     def order(self) -> float:
         letter = self.prefix[0]
-        if letter == "ё":  # ё is between е and ж, but ord('ё') is not between them
-            return 0.5 * (ord('е') + ord('ж'))
-        elif letter == "Ё":  # Ё is between Е and Ж, but ord('Ё') is not between them
-            return 0.5 * (ord('Е') + ord('Ж'))
+        if letter == "ё":  # ё is between е and ж, but ord("ё") is not between them
+            return 0.5 * (ord("е") + ord("ж"))
+        elif letter == "Ё":  # Ё is between Е and Ж, but ord("Ё") is not between them
+            return 0.5 * (ord("Е") + ord("Ж"))
         else:
             return ord(letter)
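Why LetterPrefix.order special-cases "ё" above: in Unicode the letter does not sit between "е" and "ж", so a plain ord() would break the predecessor/successor ordering of Russian list letters. The numbers are easy to verify:

print(ord("е"), ord("ж"), ord("ё"))  # 1077 1078 1105
print(0.5 * (ord("е") + ord("ж")))   # 1077.5 -- places "ё" where the alphabet expects it
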
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/prefix.py
index 3057bdeb..5869d086 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/prefix.py
@@ -30,7 +30,7 @@ class LinePrefix(abc.ABC):
     regexp = None

     def __init__(self, prefix: str, indent: float) -> None:
-        assert self.is_valid(prefix), "`{}` is invalid prefix for this {} type".format(prefix, self.name)
+        assert self.is_valid(prefix), f"`{prefix}` is invalid prefix for this {self.name} type"
         self.prefix = prefix
         self.indent = indent
@@ -70,14 +70,13 @@ def is_valid(prefix_str: str) -> bool:
         """
         returns true if prefix_str is valid for this type of prefix, false otherwise.
         :param prefix_str: the string representation of the prefix
-        For example '1.' is valid for DottedPrefix
+        For example "1." is valid for DottedPrefix
         :return:
         """
         pass

     def __str__(self) -> str:
-        name = self.__class__.__name__
-        return "{}({})".format(name, self.prefix)
+        return f"{self.__class__.__name__}({self.prefix})"

     def __repr__(self) -> str:
         return self.__str__()
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py
index b53b4c81..cca57f89 100644
--- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py
+++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py
@@ -1,4 +1,5 @@
 import re
+
 import roman

 from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix
@@ -14,7 +15,7 @@ class RomanPrefix(LinePrefix):
     IV. forth item
     """

-    regexp = re.compile(r'^\s*[ivxl]\.')
+    regexp = re.compile(r"^\s*[ivxl]\.")
     name = "roman"

     def __init__(self, prefix: str, indent: float) -> None:
@@ -30,4 +31,4 @@ def is_valid(prefix_str: str) -> bool:
         if len(prefix_str) <= 1 or not prefix_str.endswith("."):
             return False
         prefix_set = set(prefix_str[:-1])
-        return prefix_set.issubset(set('ivxl'))
+        return prefix_set.issubset(set("ivxl"))
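is_valid for RomanPrefix above is small enough to restate whole as a standalone function; only the print calls are added:

def is_valid(prefix_str: str) -> bool:
    if len(prefix_str) <= 1 or not prefix_str.endswith("."):
        return False
    return set(prefix_str[:-1]).issubset(set("ivxl"))

print(is_valid("iv."))  # True
print(is_valid("iv)"))  # False: that form belongs to BracketRomanPrefix
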
diff --git a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py
index 9c3b8537..6db29463 100644
--- a/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/toc_feature_extractor.py
@@ -1,7 +1,9 @@
 import re
 from typing import List, Optional, Tuple, Union
+
 import numpy as np
 from Levenshtein._levenshtein import ratio
+
 from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -59,8 +61,9 @@ def __get_raw_result(self, corrected_lines: np.ndarray, len_lines: int, marks: n
             if sum(marks[:idx]) > 5 and not np.any(marks[idx: idx + self.window_size]):
                 corrected_marks.extend([False] * (len_lines - self.window_size - idx))
                 break
-            corrected_marks.append(np.any(marks[idx: idx + self.window_size]) and np.any(marks[:idx]) or
-                                   marks[idx] and np.any(marks[idx + 1: idx + self.window_size]))
+            marked_before = np.any(marks[idx: idx + self.window_size]) and np.any(marks[:idx])
+            marked_after = marks[idx] and np.any(marks[idx + 1: idx + self.window_size])
+            corrected_marks.append(marked_before or marked_after)
         corrected_marks.extend([False] * self.window_size)
         result = list(corrected_lines[corrected_marks])
         return result
@@ -75,7 +78,7 @@ def __get_probable_toc(self, document: List[LineWithMeta]) -> Tuple[List[Union[d
         corrected_lines = []
         # First step: we check each line with regular expressions and find the TOC title and TOC items
         # We filter too short probable TOCs (< 6 TOC items) or too long probable TOC items (> 5 lines long)
-        for i, line in enumerate(document):
+        for line in document:
             line_text = line.line

             # check if the line is a TOC title
@@ -112,7 +115,7 @@ def __check_page_order(self, corrected_result: List[dict]) -> bool:
         Second TOC item ... 2
         Third TOC item .... 5
         """
-        assert(len(corrected_result) > 1)
+        assert len(corrected_result) > 1
         right_page_order = True
         prev_page = int(corrected_result[0]["page"])
         for item in corrected_result[1:]:
diff --git a/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py b/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py
index 0d6af840..d46ff663 100644
--- a/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py
+++ b/dedoc/structure_extractors/feature_extractors/tz_feature_extractor.py
@@ -1,6 +1,6 @@
 import re
-from collections import defaultdict, Counter
-from typing import List, Iterable, Tuple, Optional, Iterator
+from collections import Counter, defaultdict
+from typing import Iterable, Iterator, List, Optional, Tuple

 import pandas as pd
@@ -43,8 +43,7 @@ def fit(self, documents: List[LineWithMeta], y: Optional[List[str]] = None) -> "

     def transform(self, documents: List[List[LineWithMeta]], y: Optional[List[str]] = None) -> pd.DataFrame:
         list_features = self.list_feature_extractor.transform(documents)
         result_matrix = pd.concat([self.__process_document(document) for document in documents], ignore_index=True)
-        result_matrix["is_in_toc"] = list(
-            flatten(self.toc_extractor.is_line_in_toc(document) for document in documents))
+        result_matrix["is_in_toc"] = list(flatten(self.toc_extractor.is_line_in_toc(document) for document in documents))
         result_matrix = pd.concat([result_matrix, list_features], axis=1)
         features = sorted(result_matrix.columns)
         cnt = Counter(features)
@@ -65,7 +64,7 @@ def __process_document(self, lines: List[LineWithMeta]) -> pd.DataFrame:
             start_page, finish_page = 0, 0

         one_line_features_dict = defaultdict(list)
-        for line_id, line in enumerate(lines):
+        for line in lines:
             for item in self._one_line_features(line, len(lines), start_page=start_page, finish_page=finish_page):
                 feature_name, feature = item[0], item[1]
                 one_line_features_dict[feature_name].append(feature)
@@ -77,11 +76,7 @@ def __process_document(self, lines: List[LineWithMeta]) -> pd.DataFrame:
         result_matrix = pd.concat([one_line_features_df, features_df], axis=1)
         return result_matrix

-    def _one_line_features(self,
-                           line: LineWithMeta,
-                           total_lines: int,
-                           start_page: int,
-                           finish_page: int) -> Iterator[Tuple[str, int]]:
+    def _one_line_features(self, line: LineWithMeta, total_lines: int, start_page: int, finish_page: int) -> Iterator[Tuple[str, int]]:
         text = line.line.lower()

         yield from self._start_regexp(line.line, self.list_item_regexp)
@@ -93,7 +88,7 @@ def _one_line_features(self,
         number = self.number_regexp.match(text)
         number = number.group().strip() if number else ""
-        if number.endswith((')', '}')):
+        if number.endswith((")", "}")):
             number = number[:-1]
         yield ("dot_number_regexp", 1) if number.endswith(".") else ("dot_number_regexp", 0)
         yield "dot_number_regexp_len", len(number.split("."))
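A hedged sketch of the page-order check whose head appears in __check_page_order above: only the first two lines of the loop are visible in the hunk, so the body here is assumed from the ascending-pages contract stated in the docstring, and check_page_order is a stand-in name:

def check_page_order(toc_items):
    prev_page = int(toc_items[0]["page"])
    for item in toc_items[1:]:
        if int(item["page"]) < prev_page:
            return False  # a TOC whose pages go backwards is probably not a TOC
        prev_page = int(item["page"])
    return True

print(check_page_order([{"page": "1"}, {"page": "2"}, {"page": "5"}]))  # True
print(check_page_order([{"page": "1"}, {"page": "7"}, {"page": "5"}]))  # False
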
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py
index e2571c60..e4123fa4 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/abstract_hierarchy_level_builder.py
@@ -43,7 +43,7 @@ def _postprocess_roman(hierarchy_level: HierarchyLevel, line: LineWithMeta) -> L
         match = LawTextFeatures.roman_regexp.match(line.line)
         prefix = line.line[match.start(): match.end()]
         suffix = line.line[match.end():]
-        symbols = [('T', 'I'), ('Т', 'I'), ('У', 'V'), ('П', "II"), ('Ш', "III"), ('Г', 'I')]
+        symbols = [("T", "I"), ("Т", "I"), ("У", "V"), ("П", "II"), ("Ш", "III"), ("Г", "I")]
         for symbol_from, symbol_to in symbols:
             prefix = prefix.replace(symbol_from, symbol_to)
         line.set_line(prefix + suffix)
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
index af89c60e..85f3006d 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
@@ -1,4 +1,4 @@
-from typing import Tuple, List
+from typing import List, Tuple

 from dedoc.data_structures import BoldAnnotation
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
@@ -18,9 +18,7 @@ def __int__(self) -> None:
         super().__init__()
         self.digits_with_dots_regexp = regexps_digits_with_dots

-    def get_lines_with_hierarchy(self,
-                                 lines_with_labels: List[Tuple[LineWithMeta, str]],
-                                 init_hl_depth: int) -> List[LineWithMeta]:
+    def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]:
         if len(lines_with_labels) > 0:
             line = lines_with_labels[0][0]
             page_id = line.metadata.page_id
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py
index b524accc..f1f6b236 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/header_builder/header_hierarchy_level_builder.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import Tuple, Optional, List
+from typing import List, Optional, Tuple

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -25,20 +25,12 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s

         for line, label in lines_with_labels:
             # postprocessing of others units
-            hierarchy_level, previous_hl = self._line_2level(text=line.line,
-                                                             label=label,
-                                                             init_hl_depth=init_hl_depth,
-                                                             previous_hl=previous_hl)
+            hierarchy_level, previous_hl = self._line_2level(text=line.line, label=label, init_hl_depth=init_hl_depth, previous_hl=previous_hl)
             self._postprocess_roman(hierarchy_level, line)

             metadata = deepcopy(line.metadata)
             metadata.hierarchy_level = hierarchy_level
-            line = LineWithMeta(
-                line=line.line,
-                metadata=metadata,
-                annotations=line.annotations,
-                uid=line.uid
-            )
+            line = LineWithMeta(line=line.line, metadata=metadata, annotations=line.annotations, uid=line.uid)
             result.append(line)
         return result
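A standalone run of the symbols table from _postprocess_roman above, fixing Cyrillic lookalikes that OCR produces in place of Roman numerals. The prefix/suffix split normally comes from LawTextFeatures.roman_regexp, so it is hard-coded here:

symbols = [("T", "I"), ("Т", "I"), ("У", "V"), ("П", "II"), ("Ш", "III"), ("Г", "I")]

def fix_roman(prefix: str, suffix: str) -> str:
    for symbol_from, symbol_to in symbols:
        prefix = prefix.replace(symbol_from, symbol_to)
    return prefix + suffix

print(fix_roman("Ш. ", "Заключение"))  # "III. Заключение"
print(fix_roman("У. ", "Приложения"))  # "V. Приложения"
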
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py
index 5908bbbc..9f856cb5 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/application_builder/abstract_application_hierarchy_level_builder.py
@@ -1,7 +1,7 @@
 import abc
 import copy
 from copy import deepcopy
-from typing import Tuple, Optional, List
+from typing import List, Optional, Tuple

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -22,26 +22,18 @@ class AbstractApplicationHierarchyLevelBuilder(AbstractHierarchyLevelBuilder, ab
     def structure_unit_builder(self) -> AbstractStructureUnit:
         pass

-    def get_lines_with_hierarchy(self,
-                                 lines_with_labels: List[Tuple[LineWithMeta, str]],
-                                 init_hl_depth: int) -> List[LineWithMeta]:
+    def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]:
         if len(lines_with_labels) == 0:
             return []
         result = []
         # detect begin of body
-        previous_hl = HierarchyLevel(level_1=init_hl_depth,  # 2
-                                     level_2=0,
-                                     can_be_multiline=True,
-                                     line_type='application')
+        previous_hl = HierarchyLevel(level_1=init_hl_depth, level_2=0, can_be_multiline=True, line_type="application")
         lines_with_labels[0] = lines_with_labels[0][0], "application"
         previous_line_start_of_application = False

         for line_id, (line, label) in enumerate(lines_with_labels):
             # postprocessing of others units
-            hierarchy_level, previous_hl = self._line_2level(text=line.line,
-                                                             label=label,
-                                                             init_hl_depth=init_hl_depth,
-                                                             previous_hl=previous_hl)
+            hierarchy_level, previous_hl = self._line_2level(text=line.line, label=label, init_hl_depth=init_hl_depth, previous_hl=previous_hl)
             assert previous_hl is None or hierarchy_level == previous_hl

             # postprocess multiple applications
@@ -58,12 +50,7 @@ def get_lines_with_hierarchy(self,
             if line_id == 0:
                 hierarchy_level.can_be_multiline = False
             metadata.hierarchy_level = hierarchy_level
-            line = LineWithMeta(
-                line=line.line,
-                metadata=metadata,
-                annotations=line.annotations,
-                uid=line.uid
-            )
+            line = LineWithMeta(line=line.line, metadata=metadata, annotations=line.annotations, uid=line.uid)
             result.append(line)
         return result

@@ -84,9 +71,7 @@ def _line_2level(self,
             label = "structure_unit"

         if label == "structure_unit":
-            return self.structure_unit_builder.structure_unit(text=text,
-                                                              init_hl_depth=init_hl_depth,
-                                                              previous_hl=previous_hl)
+            return self.structure_unit_builder.structure_unit(text=text, init_hl_depth=init_hl_depth, previous_hl=previous_hl)
         elif label == "footer":
             return HierarchyLevel(None, None, False, HierarchyLevel.raw_text), None
         elif label == "raw_text" and previous_hl is not None and previous_hl.line_type == "chapter":
@@ -99,12 +84,11 @@ def _line_2level(self,
             return HierarchyLevel(1, 1, False, "Other"), None

         elif label in ("application", "header", "raw_text"):
-            if label == "application" or (label == "raw_text" and
-                                          previous_hl is not None and
-                                          previous_hl.line_type == "application"):
+            application_continue = label == "raw_text" and previous_hl is not None and previous_hl.line_type == "application"
+            if label == "application" or application_continue:
                 hl = HierarchyLevel(init_hl_depth, 0, True, "application")
                 return hl, hl
             else:
                 return HierarchyLevel.create_raw_text(), None
         else:
-            raise Exception("{} {}".format(text, label))
+            raise Exception(f"{text} {label}")
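The application_continue variable introduced above encodes the rule that a raw_text line stays inside an application block while the previous hierarchy level is already "application". A toy restatement (continues_application is a hypothetical name; previous_line_type stands in for previous_hl.line_type):

def continues_application(label: str, previous_line_type: str) -> bool:
    application_continue = label == "raw_text" and previous_line_type == "application"
    return label == "application" or application_continue

print(continues_application("raw_text", "application"))  # True: still inside the application
print(continues_application("raw_text", "body"))         # False: plain raw text
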
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py
index fb5c58d1..ec326538 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/abstract_body_hierarchy_level_builder.py
@@ -1,11 +1,11 @@
 import abc
 from copy import deepcopy
-from typing import Tuple, Optional, List
+from typing import List, Optional, Tuple
 from uuid import uuid1

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
-from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.data_structures.line_metadata import LineMetadata
+from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
 from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder
@@ -32,9 +32,7 @@ def get_body_line(page_id: int = 0, line_id: int = 0, init_hl_depth: int = 1) ->
         page_id = page_id
         line_id = line_id
         return LineWithMeta(line="",
-                            metadata=LineMetadata(hierarchy_level=HierarchyLevel(init_hl_depth, 0, False, "body"),
-                                                  page_id=page_id,
-                                                  line_id=line_id),
+                            metadata=LineMetadata(hierarchy_level=HierarchyLevel(init_hl_depth, 0, False, "body"), page_id=page_id, line_id=line_id),
                             annotations=[],
                             uid=line_uid)
@@ -60,12 +58,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s
                 result.append(self.get_body_line(init_hl_depth=init_hl_depth))
                 is_body_begun = True

-            line = LineWithMeta(
-                line=line.line,
-                metadata=metadata,
-                annotations=line.annotations,
-                uid=line.uid
-            )
+            line = LineWithMeta(line=line.line, metadata=metadata, annotations=line.annotations, uid=line.uid)
             result.append(line)
         if not is_body_begun:
             result.append(self.get_body_line(init_hl_depth=init_hl_depth))
@@ -99,4 +92,4 @@ def _line_2level(self,
         if label == "application":
             return HierarchyLevel(None, None, False, HierarchyLevel.raw_text), None
         else:
-            raise Exception("{} {}".format(text, label))
+            raise Exception(f"{text} {label}")
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/body_foiv_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/body_foiv_hierarchy_level_builder.py
index 94b30e81..794f27e1 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/body_foiv_hierarchy_level_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/body_builder/body_foiv_hierarchy_level_builder.py
@@ -1,7 +1,7 @@
-from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.foiv_structure_unit import FoivStructureUnitBuilder
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_subitem, regexps_item_with_bracket
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \
     AbstractBodyHierarchyLevelBuilder
+from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.foiv_structure_unit import FoivStructureUnitBuilder
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_item_with_bracket, regexps_subitem


 class BodyFoivHierarchyLevelBuilder(AbstractBodyHierarchyLevelBuilder):
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/cellar_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/cellar_builder.py
index f000edb3..22b156e7 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/cellar_builder.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/cellar_builder.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import Tuple, List
+from typing import List, Tuple

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -15,7 +15,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s
         # detect begin of body
         hierarchy_level = HierarchyLevel(level_1=init_hl_depth, level_2=0, can_be_multiline=True, line_type="cellar")

-        for line, label in lines_with_labels:
+        for line, _ in lines_with_labels:
             # postprocessing of others units
             metadata = deepcopy(line.metadata)
             metadata.hierarchy_level = hierarchy_level
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py
index 5484e520..e8102e8b 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/foiv_structure_unit.py
@@ -1,10 +1,9 @@
-from typing import Tuple, Optional
+from typing import Optional, Tuple

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.abstract_structure_unit import AbstractStructureUnit
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_subitem, regexps_item_with_bracket, \
-    regexps_foiv_item
+from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_foiv_item, regexps_item_with_bracket, regexps_subitem


 class FoivStructureUnitBuilder(AbstractStructureUnit):
diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py
index fe535f24..8be73e92 100644
--- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py
+++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/structure_unit/law_structure_unit.py
@@ -1,10 +1,10 @@
-from typing import Tuple, Optional
+from typing import Optional, Tuple

 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
 from dedoc.structure_extractors.feature_extractors.law_text_features import LawTextFeatures
 from dedoc.structure_extractors.hierarchy_level_builders.law_builders.structure_unit.abstract_structure_unit import AbstractStructureUnit
-from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_subitem, regexps_item_with_bracket, regexps_foiv_item
dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_foiv_item, regexps_item_with_bracket, regexps_subitem class LawStructureUnitBuilder(AbstractStructureUnit): diff --git a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py index b34c35b8..cffa168e 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/law_builders/stub_hierarchy_level_builder.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta diff --git a/dedoc/structure_extractors/hierarchy_level_builders/toc_builder/toc_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/toc_builder/toc_builder.py index 2835bc00..2ac22f8e 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/toc_builder/toc_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/toc_builder/toc_builder.py @@ -1,19 +1,17 @@ from typing import List, Tuple from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder class TocBuilder(AbstractHierarchyLevelBuilder): - def get_lines_with_hierarchy(self, - lines_with_labels: List[Tuple[LineWithMeta, str]], - init_hl_depth: int) -> List[LineWithMeta]: + def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]: # TODO add analyse toc if tag 'toc' and 'toc_item' exist result = [] is_toc_begun = False - for line, prediction in lines_with_labels: + for line, _ in lines_with_labels: if line.line.lower().strip() in ("содержание", "оглавление"): # set line as toc line.metadata.hierarchy_level = HierarchyLevel(init_hl_depth + 0, 0, False, "toc") result.append(line) diff --git a/dedoc/structure_extractors/hierarchy_level_builders/tz_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/tz_builder/body_builder.py index 4f94c930..a94b0d2b 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/tz_builder/body_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/tz_builder/body_builder.py @@ -1,4 +1,4 @@ -from typing import Tuple, List, Optional +from typing import List, Optional, Tuple from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta @@ -11,9 +11,7 @@ class TzBodyBuilder(AbstractHierarchyLevelBuilder): - def get_lines_with_hierarchy(self, - lines_with_labels: List[Tuple[LineWithMeta, str]], - init_hl_depth: int) -> List[LineWithMeta]: + def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]: if len(lines_with_labels) > 0: line = lines_with_labels[0][0] page_id = line.metadata.page_id @@ -25,9 +23,9 @@ def get_lines_with_hierarchy(self, previous_hl = None for line, prediction in lines_with_labels: if prediction in ("part", "named_item", "item"): - # TODO: add analyse tag 
'header' if tag exist then analyse what type of header here by using regexps + # TODO: add analyse tag "header" if tag exist then analyse what type of header here by using regexps # (part, named_item, number, NonLetterPrefix.regexp, TzTextFeatures.item_regexp ) - # Q: set HL of tag 'header'? A: (need analyse document) in some all headers can have the same HL, in the other document otherside + # Q: set HL of tag "header"? A: (need analyse document) in some all headers can have the same HL, in the other document otherside # I think we must set HL of regular expression # For Understanding header you need example of doc files. line = self.__handle_item(init_hl_depth, line, prediction, previous_hl=previous_hl) @@ -38,11 +36,7 @@ def get_lines_with_hierarchy(self, result.append(line) return result - def __handle_item(self, - init_hl_depth: int, - line: LineWithMeta, - prediction: str, - previous_hl: Optional[HierarchyLevel]) -> LineWithMeta: + def __handle_item(self, init_hl_depth: int, line: LineWithMeta, prediction: str, previous_hl: Optional[HierarchyLevel]) -> LineWithMeta: text = line.line.lower().strip() item_min_depth = 5 + init_hl_depth if prediction == "part": @@ -55,7 +49,7 @@ def __handle_item(self, elif TzTextFeatures.number_regexp.match(text): match = TzTextFeatures.number_regexp.match(text) number = text[match.start(): match.end()] - number_splitted = [n for n in number.strip().split('.') if n.isnumeric()] + number_splitted = [n for n in number.strip().split(".") if n.isnumeric()] hierarchy_level = HierarchyLevel(item_min_depth + 3, len(number_splitted), False, prediction) elif BulletPrefix.regexp.match(text): hierarchy_level = HierarchyLevel(item_min_depth + 4, 0, False, prediction) diff --git a/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py b/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py index d6096c4c..32ca7a41 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/utils_reg.py @@ -1,19 +1,19 @@ import re # item parse \d -regexps_item = re.compile(r'^\s*\d+\.\s') -regexps_foiv_item = re.compile(r'^\s*(\d+\.)+\s*') -regexps_item_with_bracket = re.compile(r'^\s*(\d*\.)*\d+[)}]') -regexps_digits_with_dots = re.compile(r'^\s*(\d+\.)+(\d+)?\s*') +regexps_item = re.compile(r"^\s*\d+\.\s") +regexps_foiv_item = re.compile(r"^\s*(\d+\.)+\s*") +regexps_item_with_bracket = re.compile(r"^\s*(\d*\.)*\d+[)}]") +regexps_digits_with_dots = re.compile(r"^\s*(\d+\.)+(\d+)?\s*") # subitem parse [а-яё] -regexps_subitem_with_dots = re.compile(r'^\s*((\d+\.((\d+|[а-яё])\.)+)|[а-яё][.)])\s') -regexps_subitem_extended = re.compile(r'^\s*[A-ZА-Яa-zа-яё][)}.]') -regexps_subitem = re.compile(r'^\s*[а-яё][)}]') +regexps_subitem_with_dots = re.compile(r"^\s*((\d+\.((\d+|[а-яё])\.)+)|[а-яё][.)])\s") +regexps_subitem_extended = re.compile(r"^\s*[A-ZА-Яa-zа-яё][)}.]") +regexps_subitem = re.compile(r"^\s*[а-яё][)}]") # number -regexps_number = re.compile(r'(^\s*\d{1,2}(\.\d{1,2})*)(\s|$|\)|\}|\.([A-ZА-Яa-zа-яё]|\s))') -regexps_ends_of_number = re.compile(r'([A-ZА-Яa-zа-яё]|\s|( )*)$') +regexps_number = re.compile(r"(^\s*\d{1,2}(\.\d{1,2})*)(\s|$|\)|\}|\.([A-ZА-Яa-zа-яё]|\s))") +regexps_ends_of_number = re.compile(r"([A-ZА-Яa-zа-яё]|\s|( )*)$") # others regexps_year = re.compile(r"(19\d\d|20\d\d)") diff --git a/dedoc/structure_extractors/line_type_classifiers/law_classifier.py b/dedoc/structure_extractors/line_type_classifiers/law_classifier.py index 592dd5cf..a8677189 100644 --- 
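The utils_reg.py hunk above only swaps string delimiters, so the compiled patterns should behave exactly as before; a small sanity sketch (the sample strings are invented, not taken from the test suite):

    import re

    # same patterns as in utils_reg.py after the quote-style change
    regexps_item = re.compile(r"^\s*\d+\.\s")
    regexps_item_with_bracket = re.compile(r"^\s*(\d*\.)*\d+[)}]")
    regexps_subitem = re.compile(r"^\s*[а-яё][)}]")

    assert regexps_item.match("1. Общие положения")          # numbered item
    assert regexps_item_with_bracket.match("2.3) подпункт")  # item with bracket
    assert regexps_subitem.match("а) подпункт")              # lettered subitem
    assert regexps_item.match("просто текст") is None        # plain text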
a/dedoc/structure_extractors/line_type_classifiers/law_classifier.py +++ b/dedoc/structure_extractors/line_type_classifiers/law_classifier.py @@ -31,9 +31,7 @@ def predict(self, lines: List[LineWithMeta]) -> List[str]: raw_text_id = list(self.classifier.classes_).index("raw_text") labels_probability[inside_quotes, raw_text_id] = 1 labels = [self.classifier.classes_[label_id] for label_id in labels_probability.argmax(1)] - content_start = [line_id for line_id, label in enumerate(labels) - if self.__match_body_begin(lines[line_id].line, label) or - self.regexp_application_begin.match(lines[line_id].line.lower().strip())] + content_start = [line_id for line_id, (label, line) in enumerate(zip(labels, lines)) if self.__match_body_begin(line.line, label)] header_end = min(content_start) if len(content_start) else len(labels) - 1 # preparing header_id features header_id = list(self.classifier.classes_).index("header") @@ -45,6 +43,6 @@ def predict(self, lines: List[LineWithMeta]) -> List[str]: return labels def __match_body_begin(self, text: str, label: str) -> bool: - return (label == "structure_unit" or - label in ("header", "raw_text") and - any(regexp.match(text.strip()) for regexp in LawTextFeatures.named_regexp)) + body_started = label in ("header", "raw_text") and any(regexp.match(text.strip()) for regexp in LawTextFeatures.named_regexp) + application_started = self.regexp_application_begin.match(text.lower().strip()) + return label == "structure_unit" or body_started or application_started diff --git a/dedoc/train_dataset/data_structures/line_with_label.py b/dedoc/train_dataset/data_structures/line_with_label.py index 2ab10c70..58bd35df 100644 --- a/dedoc/train_dataset/data_structures/line_with_label.py +++ b/dedoc/train_dataset/data_structures/line_with_label.py @@ -8,13 +8,7 @@ class LineWithLabel(LineWithMeta): - def __init__(self, - line: str, - metadata: LineMetadata, - annotations: List[Annotation], - label: str, - group: str, - uid: str = None) -> None: + def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotation], label: str, group: str, uid: str = None) -> None: super().__init__(line=line, metadata=metadata, annotations=annotations, uid=uid) self.group = group self.label = label diff --git a/dedoc/train_dataset/data_structures/task_item.py b/dedoc/train_dataset/data_structures/task_item.py index 196ad51c..17017803 100644 --- a/dedoc/train_dataset/data_structures/task_item.py +++ b/dedoc/train_dataset/data_structures/task_item.py @@ -1,20 +1,13 @@ from collections import OrderedDict -from typing import Optional, List +from typing import List, Optional from dedoc.data_structures.serializable import Serializable class TaskItem(Serializable): - def __init__(self, - task_id: int, - task_path: str, - data: any, - labeled: Optional[List[str]], - additional_info: str = "", - default_label: str = None) -> None: + def __init__(self, task_id: int, task_path: str, data: any, labeled: Optional[List[str]], additional_info: str = "", default_label: str = None) -> None: """ - @param task_id: id of this item. 
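In the law_classifier.py hunk above, the rewritten __match_body_begin names its sub-conditions, which makes the operator precedence of the old one-liner explicit (and binds tighter than or), and the application regexp moves out of the content_start comprehension into the same method. One side note: regexp.match returns a Match or None, so the new method can return a Match object despite the bool annotation, although truthiness keeps the behavior correct. A minimal equivalence check, with an invented boolean standing in for the regexp test:

    label = "raw_text"
    named_regexp_matched = True  # stand-in for any(regexp.match(...) for regexp in ...)

    old_style = label == "structure_unit" or label in ("header", "raw_text") and named_regexp_matched
    body_started = label in ("header", "raw_text") and named_regexp_matched
    new_style = label == "structure_unit" or body_started
    assert old_style == new_style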
unique in one task @param task_path: relative path to image @param data: any data, (features, comments and so on, it should not be used in annotation process and flow to diff --git a/dedoc/train_dataset/exceptions/empty_page_error.py b/dedoc/train_dataset/exceptions/empty_page_error.py new file mode 100644 index 00000000..b17924f4 --- /dev/null +++ b/dedoc/train_dataset/exceptions/empty_page_error.py @@ -0,0 +1,7 @@ +from dedoc.train_dataset.exceptions.train_dataset_error import TrainDatasetError + + +class EmptyPageError(TrainDatasetError): + + def __init__(self, message: str) -> None: + super().__init__(message) diff --git a/dedoc/train_dataset/exceptions/empty_page_exception.py b/dedoc/train_dataset/exceptions/empty_page_exception.py deleted file mode 100644 index e50bf0a0..00000000 --- a/dedoc/train_dataset/exceptions/empty_page_exception.py +++ /dev/null @@ -1,7 +0,0 @@ -from dedoc.train_dataset.exceptions.train_dataset_exception import TrainDatasetException - - -class EmptyPageException(TrainDatasetException): - - def __init__(self, message: str) -> None: - super().__init__(message) diff --git a/dedoc/train_dataset/exceptions/task_creation_error.py b/dedoc/train_dataset/exceptions/task_creation_error.py new file mode 100644 index 00000000..7f5d7710 --- /dev/null +++ b/dedoc/train_dataset/exceptions/task_creation_error.py @@ -0,0 +1,7 @@ +from dedoc.train_dataset.exceptions.train_dataset_error import TrainDatasetError + + +class TaskCreationError(TrainDatasetError): + + def __init__(self, message: str) -> None: + super().__init__(message) diff --git a/dedoc/train_dataset/exceptions/task_creation_exception.py b/dedoc/train_dataset/exceptions/task_creation_exception.py deleted file mode 100644 index 569ae5fa..00000000 --- a/dedoc/train_dataset/exceptions/task_creation_exception.py +++ /dev/null @@ -1,7 +0,0 @@ -from dedoc.train_dataset.exceptions.train_dataset_exception import TrainDatasetException - - -class TaskCreationException(TrainDatasetException): - - def __init__(self, message: str) -> None: - super().__init__(message) diff --git a/dedoc/train_dataset/exceptions/train_dataset_exception.py b/dedoc/train_dataset/exceptions/train_dataset_error.py similarity index 80% rename from dedoc/train_dataset/exceptions/train_dataset_exception.py rename to dedoc/train_dataset/exceptions/train_dataset_error.py index a3077be4..e55fe916 100644 --- a/dedoc/train_dataset/exceptions/train_dataset_exception.py +++ b/dedoc/train_dataset/exceptions/train_dataset_error.py @@ -1,4 +1,4 @@ -class TrainDatasetException(Exception): +class TrainDatasetError(Exception): """ Raise if there is some problem with completing new train dataset. 
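The exception modules above are renamed from *Exception to *Error, in line with the Error-suffix naming convention for exception classes (the convention pep8-naming checks as N818). A minimal sketch of the resulting hierarchy, including UnknownTaskError which is renamed just below; the compatibility alias at the end is a hypothetical suggestion for downstream callers, not part of this diff:

    class TrainDatasetError(Exception):
        pass

    class EmptyPageError(TrainDatasetError):
        def __init__(self, message: str) -> None:
            super().__init__(message)

    class TaskCreationError(TrainDatasetError):
        def __init__(self, message: str) -> None:
            super().__init__(message)

    class UnknownTaskError(TrainDatasetError):
        pass

    TrainDatasetException = TrainDatasetError  # hypothetical deprecated alias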
""" diff --git a/dedoc/train_dataset/exceptions/unknown_task.py b/dedoc/train_dataset/exceptions/unknown_task_error.py similarity index 52% rename from dedoc/train_dataset/exceptions/unknown_task.py rename to dedoc/train_dataset/exceptions/unknown_task_error.py index 325c8326..b6257a12 100644 --- a/dedoc/train_dataset/exceptions/unknown_task.py +++ b/dedoc/train_dataset/exceptions/unknown_task_error.py @@ -1,7 +1,7 @@ -from dedoc.train_dataset.exceptions.train_dataset_exception import TrainDatasetException +from dedoc.train_dataset.exceptions.train_dataset_error import TrainDatasetError -class UnknownTaskException(TrainDatasetException): +class UnknownTaskError(TrainDatasetError): """ Raise if you try to create dataset with unknown type """ diff --git a/dedoc/train_dataset/extractors/line_with_meta_extractor.py b/dedoc/train_dataset/extractors/line_with_meta_extractor.py index 3ea38790..25bc01bc 100644 --- a/dedoc/train_dataset/extractors/line_with_meta_extractor.py +++ b/dedoc/train_dataset/extractors/line_with_meta_extractor.py @@ -62,7 +62,7 @@ def _get_lines(self, document_name: str, labels: List[dict]) -> List[LineWithLab elif document_name.endswith(".html"): reader = self.html_reader else: - raise Exception("Unknown document type {}".format(document_name)) + raise Exception(f"Unknown document type {document_name}") document = reader.read(os.path.join(self.documents_path, document_name), parameters={}) lines = document.lines return self.__add_labels(document_name, labels, lines) @@ -119,14 +119,8 @@ def _lines_from_image(self, document_name: str, labels: List[dict]) -> List[Line else: image = convert_from_path(path, first_page=page_num + 1, last_page=page_num + 2)[0] - page_with_bboxes = PageWithBBox( - image=np.array(image), - page_num=page_num, - bboxes=bboxes - ) - label_dict = { - data["data"]["_uid"]: data["labeled"][0] for data in labels - } + page_with_bboxes = PageWithBBox(image=np.array(image), page_num=page_num, bboxes=bboxes) + label_dict = {data["data"]["_uid"]: data["labeled"][0] for data in labels} lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page_with_bboxes) for line in lines: @@ -141,5 +135,5 @@ def _lines_from_image(self, document_name: str, labels: List[dict]) -> List[Line ) result.append(line_with_label) else: - print("unknown line {}".format(line.uid)) # noqa + print(f"unknown line {line.uid}") # noqa return result diff --git a/dedoc/train_dataset/task_manager.py b/dedoc/train_dataset/task_manager.py index bc15e245..eb8587cf 100644 --- a/dedoc/train_dataset/task_manager.py +++ b/dedoc/train_dataset/task_manager.py @@ -6,7 +6,7 @@ from tempfile import TemporaryDirectory from typing import Dict, Union -from flask import Flask, request, send_file, Response +from flask import Flask, Response, request, send_file app = Flask(__name__, static_folder=os.path.dirname(__file__)) @@ -21,7 +21,7 @@ form_results = file.read() -@app.route('/', methods=['GET']) +@app.route("/", methods=["GET"]) def get_info() -> str: if len(tasks) > 0: return form_input.format(tasks_left=len(tasks)) @@ -30,7 +30,7 @@ def get_info() -> str:
Получить результаты
""" -@app.route('/upload', methods=['POST']) +@app.route("/upload", methods=["POST"]) def upload() -> Union[str, Response]: parameters = {k: v for k, v in request.values.items()} name = parameters.get("name", "Инкогнито") @@ -39,14 +39,14 @@ def upload() -> Union[str, Response]: else: task = tasks.pop() with open("task_manager.log", "a") as file_log: - file_log.write("{} take task {}\n".format(name, task)) + file_log.write(f"{name} take task {task}\n") return send_file(task, as_attachment=True, attachment_filename=task) -@app.route('/upload_results', methods=['POST', "GET"]) +@app.route("/upload_results", methods=["POST", "GET"]) def upload_results() -> Response: if request.method == "POST": - file = request.files['file'] + file = request.files["file"] with TemporaryDirectory() as tmp_dir: name = file.filename path_out = os.path.join(tmp_dir, name) @@ -62,7 +62,7 @@ def upload_results() -> Response: path_out = os.path.join(tmp_dir, file_name) archive.extract(member=file_name, path=tmp_dir) _save_result_file(path_out, file_name) - return '
Результат получен {}'.format(cnt) + return f"Результат получен {cnt}
" if request.method == "GET": return form_results @@ -71,13 +71,13 @@ def _save_result_file(path: str, name: str) -> None: path_out = os.path.abspath(os.path.join(results_dir, name)) shutil.copy(path, path_out) with open("task_manager.log", "a") as file_log: - file_log.write("save file in {}\n".format(path_out)) + file_log.write(f"save file in {path_out}\n") -@app.route('/get_results', methods=["GET"]) +@app.route("/get_results", methods=["GET"]) def get_results() -> Response: with TemporaryDirectory() as tmp_dir: - archive_name = "results_{}.zip".format(int(time.time())) + archive_name = f"results_{int(time.time())}.zip" archive_path = os.path.join(tmp_dir, archive_name) with zipfile.ZipFile(archive_path, "w") as archive: labeled = _merge_labeled() @@ -88,7 +88,7 @@ def get_results() -> Response: files = [file for file in original_documents.namelist() if file in original_documents_set] for file in files: with original_documents.open(file) as f_in: - archive.writestr("original_documents/{}".format(file), f_in.read()) + archive.writestr(f"original_documents/{file}", f_in.read()) archive.write("task_manager.log") return send_file(archive_path, as_attachment=True, attachment_filename=archive_name) @@ -107,5 +107,5 @@ def _merge_labeled() -> Dict[str, dict]: return labeled -if __name__ == '__main__': +if __name__ == "__main__": app.run(host="0.0.0.0", port=3000) diff --git a/dedoc/train_dataset/taskers/concrete_taskers/abstract_line_label_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/abstract_line_label_tasker.py index 669033e0..877cd4ac 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/abstract_line_label_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/abstract_line_label_tasker.py @@ -5,15 +5,14 @@ import uuid import zipfile from abc import abstractmethod -from collections import defaultdict, OrderedDict +from collections import OrderedDict, defaultdict from tempfile import TemporaryDirectory -from typing import List, Iterable, Callable - -from dedoc.train_dataset.train_dataset_utils import get_original_document_path +from typing import Callable, Iterable, List from dedoc.train_dataset.data_structures.images_archive import ImagesArchive from dedoc.train_dataset.data_structures.task_item import TaskItem from dedoc.train_dataset.taskers.concrete_taskers.abstract_tasker import AbstractTasker +from dedoc.train_dataset.train_dataset_utils import get_original_document_path class AbstractLineLabelTasker(AbstractTasker): @@ -63,11 +62,8 @@ def create_tasks(self, task_size: int, tasks_uid: str = None) -> Iterable[str]: random.shuffle(pages) batches = list(enumerate(self._task_batch(pages=pages, size=task_size))) for task_id, task in batches: - self.progress_bar[tasks_uid] = "done = {}; total = {} in progress = 1".format(task_id, len(batches)) - path = self._create_one_task(task=task, - task_id=task_id, - job_uid=tasks_uid, - images=images) + self.progress_bar[tasks_uid] = f"done = {task_id}; total = {len(batches)} in progress = 1" + path = self._create_one_task(task=task, task_id=task_id, job_uid=tasks_uid, images=images) yield path os.remove(path) @@ -103,14 +99,10 @@ def _task_batch(self, pages: Iterable[List[dict]], size: int) -> Iterable[List[L if len(task) > 0: yield task - def _create_one_task(self, - task: List[List[dict]], - task_id: int, - job_uid: str, - *, images: ImagesArchive) -> str: - task_name = "{:06d}_{}".format(task_id, "".join(random.sample(self._symbols, 3))) - task_directory = "task_{}".format(task_name) - path = os.path.join(self.tmp_dir, 
"{}.zip".format(task_directory)) + def _create_one_task(self, task: List[List[dict]], task_id: int, job_uid: str, *, images: ImagesArchive) -> str: + task_name = f"{task_id:06d}_{''.join(random.sample(self._symbols, 3))}" + task_directory = f"task_{task_name}" + path = os.path.join(self.tmp_dir, f"{task_directory}.zip") task_items = OrderedDict() item_id = 0 with zipfile.ZipFile(path, "w") as task_archive: @@ -123,32 +115,21 @@ def _create_one_task(self, page = [line for line in page if line["_line"].strip() != ""] if len(page) == 0: continue - items = self._one_scanned_page(page=page, - task_archive=task_archive, - task_directory=task_directory, - images=images) + items = self._one_scanned_page(page=page, task_archive=task_archive, task_directory=task_directory, images=images) for item in items: item.task_id = item_id item_id += 1 task_items[item.task_id] = item.to_dict() self.progress_bar[job_uid] = self.progress_bar.get(job_uid, "").split("\n")[0] - self.progress_bar[job_uid] += "\n done = {} total = {}".format(page_id, len(task)) - task_archive.writestr("{}/tasks.json".format(task_directory), - json.dumps(task_items, ensure_ascii=False, indent=4).encode("utf-8")) + self.progress_bar[job_uid] += f"\n done = {page_id} total = {len(task)}" + task_archive.writestr(f"{task_directory}/tasks.json", json.dumps(task_items, ensure_ascii=False, indent=4).encode("utf-8")) task_archive.write(self.manifest_path, os.path.join(task_directory, os.path.basename(self.manifest_path))) - self._add_config(task_archive, task_name=task_name, task_directory=task_directory, - config_path=self.config_path, tmp_dir=self.tmp_dir) - self._add_docker_files(archive=task_archive, - task_directory=task_directory, - dockerfile_directory="img_classifier_dockerfile") + self._add_config(task_archive, task_name=task_name, task_directory=task_directory, config_path=self.config_path, tmp_dir=self.tmp_dir) + self._add_docker_files(archive=task_archive, task_directory=task_directory, dockerfile_directory="img_classifier_dockerfile") return path @abstractmethod - def _one_scanned_page(self, - page: List[dict], - task_archive: zipfile.ZipFile, - task_directory: str, *, - images: ImagesArchive) -> List[TaskItem]: + def _one_scanned_page(self, page: List[dict], task_archive: zipfile.ZipFile, task_directory: str, *, images: ImagesArchive) -> List[TaskItem]: pass def _get_pages(self) -> List[List[dict]]: diff --git a/dedoc/train_dataset/taskers/concrete_taskers/abstract_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/abstract_tasker.py index 44c60a9d..3141f039 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/abstract_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/abstract_tasker.py @@ -1,8 +1,8 @@ import json import os import zipfile -from typing import Iterable, List from abc import ABC, abstractmethod +from typing import Iterable, List class AbstractTasker(ABC): @@ -14,8 +14,8 @@ def create_tasks(self, task_size: int, tasks_uid: str) -> Iterable[str]: """ create tasks one by one, put them into zip archive and return path to this archive. warning it will remove archive with previous task before starts to form new task. - Every task archive should be self contained: it mast include images, task config, manifest and so on - @param task_size: size of one task, task should not be large than this. For example number of page. + Every task archive should be self contained: it must include images, task config, manifest and so on + @param task_size: size of one task, task should not be larger than this. 
For example number of page. @param tasks_uid: uid of task @return: path to the zip archive with task. """ @@ -36,12 +36,9 @@ def _read_json(self, path: str, required: bool = False) -> List[dict]: elif not required: return [] else: - raise Exception("file not found {}".format(path)) + raise Exception(f"file not found {path}") - def _add_docker_files(self, - archive: zipfile.ZipFile, - task_directory: str, - dockerfile_directory: str) -> zipfile.ZipFile: + def _add_docker_files(self, archive: zipfile.ZipFile, task_directory: str, dockerfile_directory: str) -> zipfile.ZipFile: """ add some common files to task archive (in place) @param archive: task archive @@ -51,19 +48,14 @@ def _add_docker_files(self, """ docker_path = os.path.join(self.resources_path, "train_dataset", dockerfile_directory, "Dockerfile") - archive.write(filename=docker_path, arcname="{}/Dockerfile".format(task_directory)) + archive.write(filename=docker_path, arcname=f"{task_directory}/Dockerfile") readme_path = os.path.join(self.resources_path, "train_dataset", dockerfile_directory, "README.md") - archive.write(filename=readme_path, arcname="{}/README.md".format(task_directory)) + archive.write(filename=readme_path, arcname=f"{task_directory}/README.md") run_path = os.path.join(self.resources_path, "train_dataset", dockerfile_directory, "run.sh") - archive.write(filename=run_path, arcname="{}/run.sh".format(task_directory)) + archive.write(filename=run_path, arcname=f"{task_directory}/run.sh") return archive - def _add_config(self, - task_archive: zipfile.ZipFile, - task_name: str, - task_directory: str, - config_path: str, - tmp_dir: str) -> None: + def _add_config(self, task_archive: zipfile.ZipFile, task_name: str, task_directory: str, config_path: str, tmp_dir: str) -> None: config = json.load(open(config_path)) config["output_path"] = config.get("output_path", "labeled.json").replace("TASK_ID", task_name) new_config_path = os.path.join(tmp_dir, os.path.basename(config_path)) diff --git a/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py index 8b0f622d..190921da 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/filtered_line_label_tasker.py @@ -1,4 +1,4 @@ -from typing import List, Callable +from typing import Callable, List from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta diff --git a/dedoc/train_dataset/taskers/concrete_taskers/header_footer_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/header_footer_tasker.py index fffcbdd4..4ecff06b 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/header_footer_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/header_footer_tasker.py @@ -1,14 +1,14 @@ import zipfile from collections import defaultdict -from typing import List, Callable +from typing import Callable, List from dedoc.train_dataset.data_structures.images_archive import ImagesArchive from dedoc.train_dataset.data_structures.task_item import TaskItem +from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker from dedoc.train_dataset.taskers.images_creators.concrete_creators.docx_images_creator import DocxImagesCreator from dedoc.train_dataset.taskers.images_creators.concrete_creators.scanned_images_creator import ScannedImagesCreator from 
dedoc.train_dataset.taskers.images_creators.concrete_creators.txt_images_creator import TxtImagesCreator from dedoc.train_dataset.taskers.images_creators.image_creator_composition import ImageCreatorComposition -from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker class HeaderFooterTasker(LineLabelTasker): @@ -26,19 +26,14 @@ def __init__(self, item2label: Callable = None, *, config: dict) -> None: - super().__init__(path2bboxes, path2lines, path2docs, manifest_path, config_path, tmp_dir, progress_bar, - item2label, config=config) + super().__init__(path2bboxes, path2lines, path2docs, manifest_path, config_path, tmp_dir, progress_bar, item2label, config=config) self.images_creators = ImageCreatorComposition(creators=[ ScannedImagesCreator(path2docs=self.path2docs), DocxImagesCreator(path2docs=self.path2docs, config=config), TxtImagesCreator(path2docs=self.path2docs, config=config) ]) - def _one_scanned_page(self, - page: List[dict], - task_archive: zipfile.ZipFile, - task_directory: str, *, - images: ImagesArchive) -> List[TaskItem]: + def _one_scanned_page(self, page: List[dict], task_archive: zipfile.ZipFile, task_directory: str, *, images: ImagesArchive) -> List[TaskItem]: group_by_page_num = defaultdict(list) for line in page: page_id = line["_metadata"]["page_id"] @@ -49,10 +44,7 @@ def _one_scanned_page(self, if len(page) > self.top_lines_count + self.bottom_lines_count: page = page[:self.top_lines_count] + page[-self.bottom_lines_count:] res.extend(page) - return super(HeaderFooterTasker, self)._one_scanned_page(page=res, - task_archive=task_archive, - task_directory=task_directory, - images=images) + return super(HeaderFooterTasker, self)._one_scanned_page(page=res, task_archive=task_archive, task_directory=task_directory, images=images) @staticmethod def _get_line_rank(line: dict) -> int: diff --git a/dedoc/train_dataset/taskers/concrete_taskers/line_label_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/line_label_tasker.py index 9cd0c07a..fe9306e6 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/line_label_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/line_label_tasker.py @@ -1,7 +1,7 @@ import os import zipfile from io import BytesIO -from typing import List, Callable +from typing import Callable, List from dedoc.train_dataset.data_structures.images_archive import ImagesArchive from dedoc.train_dataset.data_structures.task_item import TaskItem @@ -25,8 +25,7 @@ def __init__(self, item2label: Callable = None, *, config: dict) -> None: - super().__init__(path2bboxes, path2lines, path2docs, manifest_path, config_path, tmp_dir, progress_bar, - item2label, config=config) + super().__init__(path2bboxes, path2lines, path2docs, manifest_path, config_path, tmp_dir, progress_bar, item2label, config=config) self.images_creators = ImageCreatorComposition(creators=[ ScannedImagesCreator(path2docs=self.path2docs), DocxImagesCreator(path2docs=self.path2docs, config=config), @@ -41,26 +40,21 @@ def _create_images(self, pages: List[List[dict]], tmpdir: str) -> ImagesArchive: line["color"] = color return self.images_creators.create_images(pages=pages, tmpdir=tmpdir) - def _one_scanned_page(self, - page: List[dict], - task_archive: zipfile.ZipFile, - task_directory: str, *, - images: ImagesArchive) -> List[TaskItem]: + def _one_scanned_page(self, page: List[dict], task_archive: zipfile.ZipFile, task_directory: str, *, images: ImagesArchive) -> List[TaskItem]: self._page_counter += 1 task_items = [] for i, line in 
enumerate(page): - uid = line['_uid'] - image_bbox_name = "images/{:0>6d}_{:0>6d}_img_bbox_{}.jpg".format(self._page_counter, i, uid) - image_bbox = images.get_page_by_uid("{}.jpg".format(uid)) + uid = line["_uid"] + image_bbox_name = f"images/{self._page_counter:0>6d}_{i:0>6d}_img_bbox_{uid}.jpg" + image_bbox = images.get_page_by_uid(f"{uid}.jpg") if image_bbox is None: if not uid.endswith("_split"): - self.logger.warn("uid {} not found".format(uid)) + self.logger.warn(f"uid {uid} not found") continue with BytesIO() as buffer: - image_bbox.convert('RGB').save(fp=buffer, format="jpeg") - task_archive.writestr(zinfo_or_arcname=os.path.join(task_directory, image_bbox_name), - data=buffer.getvalue()) + image_bbox.convert("RGB").save(fp=buffer, format="jpeg") + task_archive.writestr(zinfo_or_arcname=os.path.join(task_directory, image_bbox_name), data=buffer.getvalue()) line_id = line["_metadata"]["line_id"] page_id = line["_metadata"]["page_id"] @@ -69,9 +63,8 @@ def _one_scanned_page(self, task_id=len(task_items), task_path=image_bbox_name, data=line, - labeled=[line["_metadata"]['hierarchy_level']['line_type']], - additional_info="
page_id {} line_id {} text {} ".format( - page_id, line_id, text), + labeled=[line["_metadata"]["hierarchy_level"]["line_type"]], + additional_info=f" page_id {page_id} line_id {line_id} text {text}
", default_label=self.item2label(line) ) task_items.append(task_item) diff --git a/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py b/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py index df11a4cc..cbad1231 100644 --- a/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py +++ b/dedoc/train_dataset/taskers/concrete_taskers/table_tasker.py @@ -10,8 +10,8 @@ from dedoc.train_dataset.data_path_config import table_path from dedoc.train_dataset.data_structures.task_item import TaskItem from dedoc.train_dataset.taskers.concrete_taskers.abstract_tasker import AbstractTasker -from dedoc.utils.utils import get_batch from dedoc.utils.image_utils import draw_rectangle +from dedoc.utils.utils import get_batch class File: @@ -37,15 +37,13 @@ def create_tasks(self, task_size: int, tasks_uid: str) -> Iterable[str]: files = self._get_files() with tempfile.TemporaryDirectory() as tmp_dir: for i, batch in enumerate(get_batch(task_size, files)): - task_directory = "task_{:03d}".format(i) - archive_path = "/tmp/{}.zip".format(task_directory) - image_directory = "{}/images".format(task_directory) + task_directory = f"task_{i:03d}" + archive_path = f"/tmp/{task_directory}.zip" + image_directory = f"{task_directory}/images" with ZipFile(archive_path, "a") as task_archive: self.__add_task(archive=task_archive, files=batch, task_directory=task_directory) dockerfile_directory = os.path.join(self.resources_path, "train_dataset/img_classifier_dockerfile") - self._add_docker_files(archive=task_archive, - task_directory=task_directory, - dockerfile_directory=dockerfile_directory) + self._add_docker_files(archive=task_archive, task_directory=task_directory, dockerfile_directory=dockerfile_directory) self._add_config(task_archive=task_archive, task_name=task_directory, task_directory=task_directory, @@ -58,15 +56,14 @@ def __add_task(self, archive: ZipFile, files: List[File], task_directory: str) - task_items = {} for task_id, file in enumerate(files): data = file.data - data["original_document"] = "{}.png".format(file.name) + data["original_document"] = f"{file.name}.png" task_items[task_id] = TaskItem(task_id=task_id, - task_path="images/{}".format(os.path.basename(file.image_path)), + task_path=f"images/{os.path.basename(file.image_path)}", labeled=None, data=data, additional_info="", default_label="table").to_dict() - archive.writestr("{}/tasks.json".format(task_directory), - json.dumps(task_items, ensure_ascii=False, indent=4).encode("utf-8")) + archive.writestr(f"{task_directory}/tasks.json", json.dumps(task_items, ensure_ascii=False, indent=4).encode("utf-8")) def get_original_documents(self) -> str: archive_path = "/tmp/original_documents.zip" @@ -77,8 +74,8 @@ def _get_files(self) -> List[File]: files = {file.split(".")[0] for file in os.listdir(table_path)} result = [] for file_name in sorted(files): - image_path = os.path.join(table_path, "{}.png".format(file_name)) - json_path = os.path.join(table_path, "{}.json".format(file_name)) + image_path = os.path.join(table_path, f"{file_name}.png") + json_path = os.path.join(table_path, f"{file_name}.json") file = File(image_path=image_path, json_path=json_path) result.append(file) return result @@ -104,6 +101,6 @@ def __add_images(self, files: List[File], archive: ZipFile, image_directory: str height=bbox.height, color=(255, 0, 0)) image_rectangle = Image.fromarray(image_rectangle) - image_path = os.path.join(tmpdir, "{}.png".format(file.name)) + image_path = os.path.join(tmpdir, f"{file.name}.png") image_rectangle.save(image_path) - 
archive.write(image_path, "{}/{}.png".format(image_directory, file.name)) + archive.write(image_path, f"{image_directory}/{file.name}.png") diff --git a/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py b/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py index 0ba8c788..f6ec7b54 100644 --- a/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py +++ b/dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py @@ -8,7 +8,7 @@ from collections import defaultdict from collections import namedtuple from copy import deepcopy -from typing import Iterator, Optional, Dict, Iterable, Tuple +from typing import Dict, Iterable, Iterator, Optional, Tuple from typing import List import numpy as np @@ -17,7 +17,7 @@ from bs4 import BeautifulSoup from pdf2image import convert_from_path -from dedoc.common.exceptions.conversion_exception import ConversionException +from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader @@ -36,8 +36,8 @@ def __init__(self, path2docs: str, *, config: dict) -> None: self.first_color = 15 self.base_color = 0 self.base_color_step = 1 - self.many_colors_file_name = 'many_colors_doc' - self.two_colors_file_name = 'two_colors_doc' + self.many_colors_file_name = "many_colors_doc" + self.two_colors_file_name = "two_colors_doc" self.config = config self.logger = self.config.get("logger", logging.getLogger()) @@ -50,7 +50,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None: nonzero pixels on bboxes only) 3 we clear bboxes from first image 4 and create one image per bbox and save in tmp dir - 5 and finally we return image with bboxes in the proper order + 5 finally we return image with bboxes in the proper order @param page: @param archive: @return: @@ -65,7 +65,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None: self.logger.info("\nstart image processing") uid_with_images = self._create_images_from_pdf(pdfs=pdfs, page=page, tmp_dir=tmp_dir) for uid, image in uid_with_images: - img_name = "{}.jpg".format(uid) + img_name = f"{uid}.jpg" with tempfile.TemporaryDirectory() as tmpdir: img_path = os.path.join(tmpdir, img_name) image.save(img_path, format="jpeg") @@ -91,7 +91,7 @@ def __create_pair_pdfs(self, docx_archive: zipfile.ZipFile, document: DocxDocume text = re.sub("w:ppr", "w:pPr", text) many_colors_pdf = self.__create_pdf_from_docx(tmp_dir, self.many_colors_file_name, namelist, text) # clear document_bs from border tags - border_tags = document_bs.find_all('w:pbdr') + border_tags = document_bs.find_all("w:pbdr") for tag in border_tags: tag.decompose() # create docx file with bboxes of two interleaving colors @@ -103,9 +103,7 @@ def __create_pair_pdfs(self, docx_archive: zipfile.ZipFile, document: DocxDocume self.logger.info("\nstart image converting") return PairedPdf(many_colors_pdf, two_colors_pdf, used_many_colors, used_two_colors) - def __draw_bboxes(self, - paragraph_list: List[Paragraph], - many_colors: bool) -> Dict[str, int]: + def __draw_bboxes(self, paragraph_list: List[Paragraph], many_colors: bool) -> Dict[str, int]: """ draw bbox in docx document around each paragraph @param paragraph_list: @@ -137,20 +135,17 @@ def __draw_bboxes(self, def 
_color_from_decimal(decimal_color: int) -> str: color = hex(decimal_color)[2:] if len(color) < 6: - color = '0' * (6 - len(color)) + color + color = "0" * (6 - len(color)) + color return color @staticmethod - def __create_pdf_from_docx(tmp_dir: str, - doc_name: str, - namelist: List[str], - doc_text: str) -> str: - with open('{}/word/document.xml'.format(tmp_dir), 'w') as f: + def __create_pdf_from_docx(tmp_dir: str, doc_name: str, namelist: List[str], doc_text: str) -> str: + with open(f"{tmp_dir}/word/document.xml", "w") as f: f.write(doc_text) - docx_path = "{}/{}.docx".format(tmp_dir, doc_name) - with zipfile.ZipFile(docx_path, mode='w') as new_d: + docx_path = f"{tmp_dir}/{doc_name}.docx" + with zipfile.ZipFile(docx_path, mode="w") as new_d: for filename in namelist: - new_d.write('{}/{}'.format(tmp_dir, filename), arcname=filename) + new_d.write(f"{tmp_dir}/{filename}", arcname=filename) # create pdf file with bbox pdf_name = DocxImagesCreator.__docx2pdf(tmp_dir, docx_path) os.remove(docx_path) @@ -166,36 +161,28 @@ def __await_for_conversion(filename: str) -> None: t += period_checking if t >= timeout: - raise ConversionException( - msg="fail with {filename}".format(filename=filename), - msg_api="Unsupported file format {}".format(filename)) + raise ConversionError(msg=f"fail with {filename}", msg_api=f"Unsupported file format {filename}") @staticmethod - def __docx2pdf(out_dir: str, - path: str) -> str: - os.system("soffice --headless --convert-to pdf {} --outdir {}".format(path, out_dir)) - out_file = '{}/{}pdf'.format(out_dir, os.path.split(path)[-1][:-4]) + def __docx2pdf(out_dir: str, path: str) -> str: + os.system(f"soffice --headless --convert-to pdf {path} --outdir {out_dir}") + out_file = f"{out_dir}/{os.path.split(path)[-1][:-4]}pdf" DocxImagesCreator.__await_for_conversion(out_file) return out_file @staticmethod - def __insert_border(bs_tree: Optional[BeautifulSoup], - color: str) -> None: + def __insert_border(bs_tree: Optional[BeautifulSoup], color: str) -> None: if bs_tree is None: return - border_str = ''.format(color=color) - border_bs = BeautifulSoup(border_str, 'lxml').body.contents[0] + border_str = f'' + border_bs = BeautifulSoup(border_str, "lxml").body.contents[0] if bs_tree.pPr: bs_tree.pPr.insert(1, border_bs) else: - border_bs = BeautifulSoup('' + border_str + '', 'lxml').body.contents[0] + border_bs = BeautifulSoup(f"{border_str}", "lxml").body.contents[0] bs_tree.insert(0, border_bs) @staticmethod @@ -213,10 +200,7 @@ def can_read(self, page: List[dict]) -> bool: image_name = get_original_document_path(self.path2docs, page) return image_name.endswith("docx") - def _create_images_from_pdf(self, - pdfs: PairedPdf, - page: List[dict], - tmp_dir: str) -> Iterable[Tuple[str, Image.Image]]: + def _create_images_from_pdf(self, pdfs: PairedPdf, page: List[dict], tmp_dir: str) -> Iterable[Tuple[str, Image.Image]]: """ we take two paired pdfs with bboxes and create images from them. 
Then we return images according to page order @@ -245,8 +229,7 @@ def _create_images_from_pdf(self, colors_dict_invert = {} page2color = {line["_uid"]: line.get("color", "#ff0000") for line in page} for uid in pdfs.two_colors: - color = ImageColor.getcolor( - "#{}".format(self._color_from_decimal(pdfs.many_colors[uid] - pdfs.two_colors[uid])), "RGB") + color = ImageColor.getcolor(f"#{self._color_from_decimal(pdfs.many_colors[uid] - pdfs.two_colors[uid])}", "RGB") colors_dict[uid] = color colors_dict_invert[color] = uid assert len(colors_dict) == len(colors_dict_invert) @@ -260,7 +243,7 @@ def _create_images_from_pdf(self, if bbox_color is not None: image_copy = deepcopy(original_image) image_copy[mask] = ImageColor.getcolor(bbox_color, "RGB") - path = "{}/{:06d}.png".format(tmp_dir, n) + path = f"{tmp_dir}/{n:06d}.png" n += 1 uid2path[uid].append(path) Image.fromarray(image_copy).save(path) diff --git a/dedoc/train_dataset/taskers/images_creators/concrete_creators/scanned_images_creator.py b/dedoc/train_dataset/taskers/images_creators/concrete_creators/scanned_images_creator.py index b1c87225..60acbfe2 100644 --- a/dedoc/train_dataset/taskers/images_creators/concrete_creators/scanned_images_creator.py +++ b/dedoc/train_dataset/taskers/images_creators/concrete_creators/scanned_images_creator.py @@ -3,7 +3,7 @@ import zipfile from copy import deepcopy from itertools import zip_longest -from typing import List, Iterator +from typing import Iterator, List import PIL import cv2 @@ -50,7 +50,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None: else: images = self._create_image_jpg(path=path, page=page) for image, line in zip_longest(images, page): - img_name = "{}.jpg".format(line["_uid"]) + img_name = f"{line['_uid']}.jpg" with tempfile.TemporaryDirectory() as tmpfile: img_path = os.path.join(tmpfile, img_name) image.save(img_path, format="jpeg") @@ -77,7 +77,7 @@ def _create_image_zip(self, path: str, page: List[dict]) -> Iterator[Image]: image = deepcopy(current_image) image_bbox = self._draw_one_bbox(image, line) image_bbox = PIL.Image.fromarray(image_bbox) - image_bbox = image_bbox.convert('RGB') + image_bbox = image_bbox.convert("RGB") yield image_bbox def _create_image_pdf(self, path: str, page: List[dict]) -> Iterator[Image]: @@ -91,7 +91,7 @@ def _create_image_pdf(self, path: str, page: List[dict]) -> Iterator[Image]: image = deepcopy(current_image) image_bbox = self._draw_one_bbox(image, line) image_bbox = PIL.Image.fromarray(image_bbox) - image_bbox = image_bbox.convert('RGB') + image_bbox = image_bbox.convert("RGB") yield image_bbox def _create_image_jpg(self, path: str, page: List[dict]) -> Iterator[Image]: @@ -99,5 +99,5 @@ def _create_image_jpg(self, path: str, page: List[dict]) -> Iterator[Image]: for line in page: image_bbox = self._draw_one_bbox(image, line) image_bbox = PIL.Image.fromarray(image_bbox) - image_bbox = image_bbox.convert('RGB') + image_bbox = image_bbox.convert("RGB") yield image_bbox diff --git a/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py b/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py index d911fc0c..2feb3a84 100644 --- a/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py +++ b/dedoc/train_dataset/taskers/images_creators/concrete_creators/txt_images_creator.py @@ -4,10 +4,10 @@ import zipfile from typing import List, Tuple -from PIL import ImageFont, ImageDraw, Image +from PIL import Image, ImageDraw, ImageFont + from 
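_create_images_from_pdf above identifies each paragraph by rendering the same document twice with different border colors and subtracting the pixel values, as in pdfs.many_colors[uid] - pdfs.two_colors[uid]. A toy numpy illustration of that trick; the array shape and color values are synthetic, not the real renders:

    import numpy as np

    many_colors = np.zeros((100, 100, 3), dtype=np.int32)  # render with per-paragraph colors
    two_colors = np.zeros((100, 100, 3), dtype=np.int32)   # render with two alternating colors
    many_colors[10:20, 5:60, 2] = 15  # one paragraph's border, drawn with color 15
    two_colors[10:20, 5:60, 2] = 1    # the same border, drawn with color 1

    mask = (many_colors != two_colors).any(axis=2)  # True only inside the paragraph bbox
    assert mask[15, 30] and not mask[0, 0]

    delta = int(many_colors[15, 30, 2] - two_colors[15, 30, 2])
    assert delta == 14  # the decimal color difference identifies the paragraph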
dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.txt_reader.raw_text_reader import RawTextReader - from dedoc.train_dataset.taskers.images_creators.concrete_creators.abstract_images_creator import AbstractImagesCreator from dedoc.train_dataset.train_dataset_utils import get_original_document_path from dedoc.utils.utils import get_batch @@ -58,14 +58,13 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None: for i, (uid, line) in enumerate(batch_lines): if line[0].isspace(): - self.logger.info( - "\n{} / {} image processed (empty line)".format(i + batch_start + 1, len(txt_lines))) + self.logger.info(f"\n{i + batch_start + 1} / {len(txt_lines)} image processed (empty line)") continue image = self.__make_image(width, height, batch_lines) - self.logger.info("\n{} / {} image processed".format(i + batch_start + 1, len(txt_lines))) + self.logger.info(f"\n{i + batch_start + 1} / {len(txt_lines)} image processed") image_with_bbox = self.__make_bbox_image(image, batch_lines, i) - img_name = "{}.jpg".format(uid) + img_name = f"{uid}.jpg" with tempfile.TemporaryDirectory() as tmpdir: img_path = os.path.join(tmpdir, img_name) image_with_bbox.save(img_path, format="jpeg") @@ -116,13 +115,13 @@ def __make_text_lines(self, page: List[LineWithMeta]) -> List[Tuple[str, List[st return txt_lines def __make_image(self, width: int, height: int, txt_lines: List[Tuple[str, List[str]]]) -> Image: - image = Image.new('RGB', (width, height), color=self.background_color) + image = Image.new("RGB", (width, height), color=self.background_color) draw = ImageDraw.Draw(image) x = self.horizontal_padding y = self.vertical_padding - for i, (uid, line) in enumerate(txt_lines): + for _, (_, line) in enumerate(txt_lines): for part in line: draw.text((x, y), part, font=self.font, fill=self.text_color) y += self.row_height diff --git a/dedoc/train_dataset/taskers/images_creators/image_creator_composition.py b/dedoc/train_dataset/taskers/images_creators/image_creator_composition.py index c85678fb..30e7a2bd 100644 --- a/dedoc/train_dataset/taskers/images_creators/image_creator_composition.py +++ b/dedoc/train_dataset/taskers/images_creators/image_creator_composition.py @@ -3,8 +3,8 @@ from typing import List from dedoc.train_dataset.data_structures.images_archive import ImagesArchive -from dedoc.train_dataset.exceptions.empty_page_exception import EmptyPageException -from dedoc.train_dataset.exceptions.task_creation_exception import TaskCreationException +from dedoc.train_dataset.exceptions.empty_page_error import EmptyPageError +from dedoc.train_dataset.exceptions.task_creation_error import TaskCreationError from dedoc.train_dataset.taskers.images_creators.concrete_creators.abstract_images_creator import AbstractImagesCreator @@ -19,12 +19,12 @@ def create_images(self, pages: List[List[dict]], tmpdir: str) -> ImagesArchive: for page in pages: assert len(page) > 0 if len(pages) == 0: - raise EmptyPageException("You try create task with 0 line in document") + raise EmptyPageError("You try create task with 0 line in document") find_handler = False for creator in self.creators: if creator.can_read(page): creator.add_images(page=page, archive=archive) find_handler = True if not find_handler: - raise TaskCreationException("No one can handle this task, first line = {}".format(pages[0])) + raise TaskCreationError(f"No one can handle this task, first line = {pages[0]}") return ImagesArchive(archive_path) diff --git a/dedoc/train_dataset/taskers/tasker.py 
b/dedoc/train_dataset/taskers/tasker.py index e8dce2ab..df8bb931 100644 --- a/dedoc/train_dataset/taskers/tasker.py +++ b/dedoc/train_dataset/taskers/tasker.py @@ -3,12 +3,11 @@ import os import random import uuid -from typing import Tuple, Dict, Optional +from typing import Dict, Optional, Tuple from zipfile import ZipFile -from dedoc.train_dataset.exceptions.unknown_task import UnknownTaskException +from dedoc.train_dataset.exceptions.unknown_task_error import UnknownTaskError from dedoc.train_dataset.taskers.concrete_taskers.abstract_tasker import AbstractTasker -import warnings class Tasker(object): @@ -45,15 +44,11 @@ def __init__(self, resources_path = os.path.join(os.path.dirname(__file__), "..", "..", "..", "resources", "train_dataset") self.resources = os.path.abspath(resources_path) - def create_tasks(self, - type_of_task: str, - count_tasks: int = None, - task_size: int = None, - task_uid: Optional[str] = None) -> Tuple[str, int]: + def create_tasks(self, type_of_task: str, count_tasks: int = None, task_size: int = None, task_uid: Optional[str] = None) -> Tuple[str, int]: """ creates subtasks with help of ConcreteTaskers and return common archive of tasks :param type_of_task: task type for calling concrete tasker (for example type_of_task='line_classifier') - :param task_size: size of one task, task should not be large than this. For example number of page. + :param task_size: size of one task, task should not be larger than this. For example number of page. :param count_tasks: number of tasks :param task_uid: unique id of task :return: archive of tasks, task size @@ -63,16 +58,15 @@ def create_tasks(self, if task_size is None and count_tasks is None: raise Exception("Task size undefined") elif count_tasks is not None and task_size is None: - warnings.warn("count_tasks is deprecated, use task_size") + self.logger.warning("count_tasks is deprecated, use task_size") task_size = math.ceil(len(os.listdir(self.images_path)) / count_tasks) elif count_tasks is not None and task_size is not None: - warnings.warn("count_tasks is deprecated, ignore its value and use task_size") + self.logger.warning("count_tasks is deprecated, ignore its value and use task_size") if type_of_task not in self.concrete_taskers: - raise UnknownTaskException(type_of_task) + raise UnknownTaskError(type_of_task) tasker = self.concrete_taskers[type_of_task] - path_to_common_zip = os.path.join(self.save_path, - "{}_{:06d}.zip".format(type_of_task, random.randint(0, 1000000))) + path_to_common_zip = os.path.join(self.save_path, f"{type_of_task}_{random.randint(0, 1000000):06d}.zip") with ZipFile(path_to_common_zip, "w") as tasks_archive: for task_path in tasker.create_tasks(task_size=task_size, diff --git a/dedoc/train_dataset/train_dataset_utils.py b/dedoc/train_dataset/train_dataset_utils.py index 8a3fb790..70ca3725 100644 --- a/dedoc/train_dataset/train_dataset_utils.py +++ b/dedoc/train_dataset/train_dataset_utils.py @@ -3,8 +3,9 @@ import shutil import zipfile from typing import List -import numpy as np + import PIL +import numpy as np from PIL.Image import Image from dedoc.data_structures.line_with_meta import LineWithMeta @@ -41,7 +42,7 @@ def save_page_with_bbox(page: PageWithBBox, document_name: str, *, config: dict) with open(os.path.join(config["intermediate_data_path"], "bboxes.jsonlines"), "a") as out: image = __to_pil(page.image) - image_name = "img_{}_{:06d}.png".format(uid, page.page_num) + image_name = f"img_{uid}_{page.page_num:06d}.png" image.save(os.path.join(images_path, image_name)) for bbox 
in page.bboxes: bbox_dict = bbox.to_dict() diff --git a/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py b/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py index 55d8c796..a7298adf 100644 --- a/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py +++ b/dedoc/train_dataset/trainer/base_sklearn_line_classifier.py @@ -7,7 +7,8 @@ import pickle from collections import Counter, OrderedDict from statistics import mean -from typing import Optional, List, Callable, Any +from typing import Any, Callable, List, Optional + import numpy as np from sklearn.metrics import accuracy_score from sklearn.model_selection import KFold @@ -23,7 +24,7 @@ class BaseClassifier(XGBClassifier): - def __init__(self, **kwargs: Any) -> None: + def __init__(self, **kwargs: Any) -> None: # noqa super().__init__(**kwargs) @@ -51,12 +52,8 @@ def __init__(self, self.feature_extractor = feature_extractor self.tmp_dir = "/tmp" if tmp_dir is None else tmp_dir url_hash = hashlib.md5(self.data_url.encode()).hexdigest() - self.dataset_dir = os.path.join(self.tmp_dir, "dataset_{}".format(url_hash)) - self.data_loader = DataLoader(dataset_dir=self.dataset_dir, - label_transformer=label_transformer, - logger=logger, - data_url=data_url, - config=config) + self.dataset_dir = os.path.join(self.tmp_dir, f"dataset_{url_hash}") + self.data_loader = DataLoader(dataset_dir=self.dataset_dir, label_transformer=label_transformer, logger=logger, data_url=data_url, config=config) self.random_seed = random_seed self.get_sample_weight = get_sample_weight if get_sample_weight is not None else lambda t: 1 os.makedirs(self.tmp_dir, exist_ok=True) @@ -80,10 +77,7 @@ def __init__(self, self.config = config self.n_splits = n_splits - def fit(self, no_cache: bool = False, - cross_val_only: bool = False, - save: bool = False, - save_errors_images: bool = False) -> None: + def fit(self, no_cache: bool = False, cross_val_only: bool = False, save: bool = False, save_errors_images: bool = False) -> None: data = self.data_loader.get_data(no_cache=no_cache) if save: self.__save(data=data, path=self.tmp_dir) @@ -91,7 +85,7 @@ def fit(self, no_cache: bool = False, logging.info(json.dumps(scores, indent=4)) if not cross_val_only: features = self.feature_extractor.fit_transform(data) - self.logger.info("data train shape {}".format(features.shape)) + self.logger.info(f"data train shape {features.shape}") n = features.shape[0] // 10 features_train, features_test = features[:-n], features[-n:] labels = self.__get_labels(data) @@ -112,7 +106,7 @@ def fit(self, no_cache: bool = False, pickle.dump((cls, self.feature_extractor.parameters()), output_file) if self.path_scores is not None: - self.logger.info("Save scores in {}".format(self.path_scores)) + self.logger.info(f"Save scores in {self.path_scores}") os.makedirs(os.path.dirname(self.path_scores), exist_ok=True) with open(self.path_scores, "w") as file: json.dump(obj=scores, fp=file, indent=4) @@ -120,7 +114,7 @@ def fit(self, no_cache: bool = False, os.makedirs(os.path.dirname(self.path_features_importances), exist_ok=True) self._save_features_importances(cls, features_train.columns) - def _save_features_importances(self, cls: Any, feature_names: List[str]) -> None: + def _save_features_importances(self, cls: Any, feature_names: List[str]) -> None: # noqa pass def __save(self, data: List[List[LineWithLabel]], path: str = "/tmp", csv_only: bool = False) -> str: @@ -135,13 +129,9 @@ def __save(self, data: List[List[LineWithLabel]], path: str = "/tmp", csv_only: features_train[uid_name] = 
[line.uid for line in flatten(data)] text_name = "text" features_train[text_name] = [line.line for line in flatten(data)] - dataset = LineClassifierDataset(dataframe=features_train, - feature_list=features_list, - group_name=group_name, - label_name=label_name, - text_name=text_name) + dataset = LineClassifierDataset(dataframe=features_train, feature_list=features_list, group_name=group_name, label_name=label_name, text_name=text_name) path = dataset.save(path, csv_only=csv_only) - self.logger.info("Save dataset into {}".format(path)) + self.logger.info(f"Save dataset into {path}") return path @abc.abstractmethod @@ -151,13 +141,13 @@ def _get_classifier(self) -> BaseClassifier: def _cross_val(self, data: List[List[LineWithLabel]], save_errors_images: bool) -> dict: error_cnt = Counter() errors_uids = [] - os.system("rm -rf {}/*".format(self.path_errors)) + os.system(f"rm -rf {self.path_errors}/*") os.makedirs(self.path_errors, exist_ok=True) scores = [] data = np.array(data, dtype=object) kf = KFold(n_splits=self.n_splits) - for iteration, (train_index, val_index) in tqdm(enumerate(kf.split(data)), total=self.n_splits): + for train_index, val_index in tqdm(kf.split(data), total=self.n_splits): data_train, data_val = data[train_index].tolist(), data[val_index].tolist() labels_train = self.__get_labels(data_train) labels_val = self.__get_labels(data_val) @@ -166,8 +156,7 @@ def _cross_val(self, data: List[List[LineWithLabel]], save_errors_images: bool) if features_train.shape[1] != features_val.shape[1]: val_minus_train = set(features_val.columns) - set(features_train.columns) train_minus_val = set(features_val.columns) - set(features_train.columns) - msg = "some features in train, but not in val {}\nsome features in val, but not in train {}".format( - val_minus_train, train_minus_val) + msg = f"some features in train, but not in val {val_minus_train}\nsome features in val, but not in train {train_minus_val}" raise ValueError(msg) cls = self._get_classifier() sample_weight = [self.get_sample_weight(line) for line in flatten(data_train)] @@ -177,7 +166,7 @@ def _cross_val(self, data: List[List[LineWithLabel]], save_errors_images: bool) if y_true != y_pred: error_cnt[(y_true, y_pred)] += 1 errors_uids.append(line.uid) - with open(os.path.join(self.path_errors, "{}_{}.txt".format(y_true, y_pred)), "a") as file: + with open(os.path.join(self.path_errors, f"{y_true}_{y_pred}.txt"), "a") as file: result = OrderedDict() result["text"] = line.line result["uid"] = line.uid diff --git a/dedoc/train_dataset/trainer/data_loader.py b/dedoc/train_dataset/trainer/data_loader.py index 1c2ba30f..6be8d780 100644 --- a/dedoc/train_dataset/trainer/data_loader.py +++ b/dedoc/train_dataset/trainer/data_loader.py @@ -3,9 +3,10 @@ import os import pickle import zipfile -from collections import defaultdict, Counter +from collections import Counter, defaultdict from tempfile import TemporaryDirectory -from typing import List, Callable, Tuple +from typing import Callable, List, Tuple + import numpy as np import pandas as pd import wget @@ -18,13 +19,7 @@ class DataLoader: - def __init__(self, - dataset_dir: str, - label_transformer: Callable[[str], str], - logger: logging.Logger, - data_url: str, - *, - config: dict) -> None: + def __init__(self, dataset_dir: str, label_transformer: Callable[[str], str], logger: logging.Logger, data_url: str, *, config: dict) -> None: self.label_transformer = label_transformer self.dataset_dir = dataset_dir self.logger = logger @@ -50,7 +45,7 @@ def get_data(self, no_cache: bool = 
False) -> List[List[LineWithLabel]]: self.logger.info("Finish download dataset") with TemporaryDirectory() as tmp_dir: - with zipfile.ZipFile(path_out, 'r') as zip_ref: + with zipfile.ZipFile(path_out, "r") as zip_ref: zip_ref.extractall(tmp_dir) metadata_extractor = LineWithMetaExtractor( path=os.path.join(tmp_dir, "labeled.json"), diff --git a/dedoc/train_dataset/trainer/dataset.py b/dedoc/train_dataset/trainer/dataset.py index f0fd5af4..75233a52 100644 --- a/dedoc/train_dataset/trainer/dataset.py +++ b/dedoc/train_dataset/trainer/dataset.py @@ -8,14 +8,8 @@ class LineClassifierDataset: - def __init__(self, - dataframe: pd.DataFrame, - feature_list: List[str], - group_name: str, - label_name: str, - text_name: str) -> None: + def __init__(self, dataframe: pd.DataFrame, feature_list: List[str], group_name: str, label_name: str, text_name: str) -> None: """ - @param dataframe: pandas dataframe with features and metadata @param feature_list: list of feature columns name @param group_name: name of group column (for example "document") @@ -40,15 +34,11 @@ def save(self, path: str = "/tmp", csv_only: bool = False) -> str: if csv_only: self.dataframe.to_csv(os.path.join(path, "dataset.csv")) return path - dir_out = os.path.join(path, "dataset_{}".format(int(time.time() * 1000))) + dir_out = os.path.join(path, f"dataset_{int(time.time() * 1000)}") os.mkdir(dir_out) self.dataframe.to_csv(os.path.join(dir_out, "dataset.csv")) self.dataframe.to_pickle(os.path.join(dir_out, "dataset.pkl.gz")) with open(os.path.join(dir_out, "description.json"), "w") as out: - d = dict( - label_name=self.label_name, - group_name=self.group_name, - text_name=self.text_name, - feature_list=self.feature_list) + d = dict(label_name=self.label_name, group_name=self.group_name, text_name=self.text_name, feature_list=self.feature_list) json.dump(obj=d, fp=out, ensure_ascii=False, indent=4) return dir_out diff --git a/dedoc/train_dataset/trainer/errors_saver.py b/dedoc/train_dataset/trainer/errors_saver.py index d4708235..af90ba52 100644 --- a/dedoc/train_dataset/trainer/errors_saver.py +++ b/dedoc/train_dataset/trainer/errors_saver.py @@ -28,17 +28,14 @@ def __init__(self, errors_path: str, dataset_path: str, logger: logging.Logger, self.dataset_path = os.path.join(dataset_path, "dataset.zip") self.config = config - def save_errors(self, error_cnt: Counter, - errors_uids: List[str], - csv_path: str, - save_errors_images: bool = False) -> None: + def save_errors(self, error_cnt: Counter, errors_uids: List[str], csv_path: str, save_errors_images: bool = False) -> None: assert len(set(errors_uids)) == len(errors_uids) - self.logger.info("save errors in {}".format(self.errors_path)) + self.logger.info(f"save errors in {self.errors_path}") errors_total_num = sum(error_cnt.values()) - print("{:16s} -> {:16s} {:6s} {:16s}".format("true", "predicted", "cnt", "(percent)")) # noqa + print(f"{'true':16s} -> {'predicted':16s} {'cnt':6s} {'(percent)':16s}") # noqa for error, cnt in error_cnt.most_common(): y_true, y_pred = error - print("{:16s} -> {:16s} {:06,} ({:02.2f}%)".format(y_true, y_pred, cnt, 100 * cnt / errors_total_num)) # noqa + print(f"{y_true:16s} -> {y_pred:16s} {cnt:06,} ({100 * cnt / errors_total_num:02.2f}%)") # noqa if save_errors_images: self.__save_images(errors_uids, csv_path) @@ -50,10 +47,7 @@ def save_errors(self, error_cnt: Counter, lines = file.readlines() lines_cnt = Counter(lines) lines.sort(key=lambda l: (-lines_cnt[l], l)) - path_out = os.path.join(self.errors_path, "{:04d}_{}".format( - int(1000 * 
len(lines) / errors_total_num), - file_name - )) + path_out = os.path.join(self.errors_path, f"{int(1000 * len(lines) / errors_total_num):04d}_{file_name}") with open(path_out, "w") as file_out: for line in lines: @@ -79,7 +73,7 @@ def __save_images(self, errors_uids: List[str], csv_path: str) -> None: if not os.path.isfile(self.dataset_path) or not os.path.isfile(csv_dataset_path): return with tempfile.TemporaryDirectory() as documents_tmp_dir: - with zipfile.ZipFile(self.dataset_path, 'r') as dataset_archive: + with zipfile.ZipFile(self.dataset_path, "r") as dataset_archive: dataset_archive.extractall(documents_tmp_dir) path2docs = os.path.join(documents_tmp_dir, "original_documents") images_creators = [ScannedImagesCreator(path2docs=path2docs), @@ -92,13 +86,14 @@ def __save_images(self, errors_uids: List[str], csv_path: str) -> None: ready_documents, ready_images = self.__prepare_files() - with zipfile.ZipFile(self.images_archive, 'a') as images_archive, \ - zipfile.ZipFile(self.errors_images_archive, 'w') as errors_images_archive: + with zipfile.ZipFile(self.images_archive, "a") as images_archive, \ + zipfile.ZipFile(self.errors_images_archive, "w") as errors_images_archive: for uid in tqdm(errors_uids): self.__process_uid(errors_images_archive, filtered_dataset, images_archive, images_creators, ready_documents, ready_images, uid) - def __process_uid(self, errors_images_archive: zipfile.ZipFile, + def __process_uid(self, + errors_images_archive: zipfile.ZipFile, filtered_dataset: pd.DataFrame, images_archive: zipfile.ZipFile, images_creators: List[AbstractImagesCreator], @@ -107,7 +102,7 @@ def __process_uid(self, errors_images_archive: zipfile.ZipFile, uid: str) -> None: done_set = set() document_name = filtered_dataset[filtered_dataset.uid == uid].head(1).group.item() - img_name = "{}.jpg".format(uid) + img_name = f"{uid}.jpg" if img_name in done_set: return # skip done image done_set.add(img_name) @@ -136,11 +131,11 @@ def __prepare_files(self) -> Tuple[List[str], List[str]]: with open(self.errors_documents, "w") as json_file: json.dump([], json_file) if not os.path.isfile(self.images_archive): - with zipfile.ZipFile(self.images_archive, 'w'): + with zipfile.ZipFile(self.images_archive, "w"): ready_images = [] ready_documents = [] else: - with zipfile.ZipFile(self.images_archive, 'r') as images_archive: + with zipfile.ZipFile(self.images_archive, "r") as images_archive: ready_images = images_archive.namelist() with open(self.errors_documents, "r") as json_file: ready_documents = json.load(json_file) diff --git a/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py b/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py index bba48e7c..a0f3bd38 100644 --- a/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py +++ b/dedoc/train_dataset/trainer/line_lstm_classifier_trainer.py @@ -1,21 +1,22 @@ import hashlib import json -import os import logging -from typing import Callable, List, Iterator, Optional, Dict, Any -from collections import OrderedDict -from torch.autograd import Variable -import torch -from torch import nn +import os import time +from collections import OrderedDict +from statistics import mean +from typing import Any, Callable, Dict, Iterator, List, Optional + import numpy as np +import pandas as pd +import torch from sklearn.model_selection import KFold +from torch import nn +from torch.autograd import Variable from torch.nn.modules.loss import CrossEntropyLoss from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from 
torch.optim.optimizer import Optimizer from tqdm import tqdm -from statistics import mean -import pandas as pd from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor from dedoc.train_dataset.data_structures.line_with_label import LineWithLabel @@ -45,10 +46,7 @@ def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor: feature_dim = self.feature_dim step_dim = self.step_dim - eij = torch.mm( - x.contiguous().view(-1, feature_dim), - self.weight - ).view(-1, step_dim) + eij = torch.mm(x.contiguous().view(-1, feature_dim), self.weight).view(-1, step_dim) if self.bias: eij = eij + self.b @@ -81,11 +79,7 @@ def __init__(self, input_dim: int, hidden_dim: int, hidden_dim_2: int, num_class # batch_first: If ``True``, then the input and output tensors are provided # as (batch, seq, feature). # with_attention - use or not Attention NN - self.lstm = nn.LSTM(input_size=input_dim, - hidden_size=hidden_dim, - num_layers=lstm_layers, - bidirectional=bidirectional, - batch_first=True) + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=lstm_layers, bidirectional=bidirectional, batch_first=True) self.lstm_attention = Attention(hidden_dim * 2, maxlen).to(device) num_directions = 2 if bidirectional else 1 self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim_2) @@ -102,8 +96,7 @@ def init_hidden(self, batch_size: int, device: torch.device) -> [torch.Tensor, t Variable(torch.zeros(self.lstm_layers * self.num_directions, batch_size, self.lstm_units)).to(device)) return h, c - def forward(self, input_tensor: List[torch.Tensor], batch_lengths: torch.Tensor, device: torch.device) \ - -> torch.Tensor: + def forward(self, input_tensor: List[torch.Tensor], batch_lengths: torch.Tensor, device: torch.device) -> torch.Tensor: batch_size = len(input_tensor) h_0, c_0 = self.init_hidden(batch_size, device) packed_embedded = pack_padded_sequence(input_tensor, batch_lengths, batch_first=True) @@ -111,8 +104,8 @@ def forward(self, input_tensor: List[torch.Tensor], batch_lengths: torch.Tensor, output_unpacked, output_lengths = pad_packed_sequence(output, batch_first=True) if self.with_attention: - '''att_mask = torch.tensor(np.array([0.3, 0.3, 0.5, 1.0, 0.5, 0.3, 0.3]), - dtype=torch.float32, device=torch.device("cuda:0"))''' + """att_mask = torch.tensor(np.array([0.3, 0.3, 0.5, 1.0, 0.5, 0.3, 0.3]), + dtype=torch.float32, device=torch.device("cuda:0"))""" att_mask = None res_nn = self.lstm_attention(output_unpacked, mask=att_mask) else: @@ -151,12 +144,8 @@ def __init__(self, self.logger = logger self.tmp_dir = "/tmp" if tmp_dir is None else tmp_dir url_hash = hashlib.md5(self.data_url.encode()).hexdigest() - self.dataset_dir = os.path.join(self.tmp_dir, "dataset_{}".format(url_hash)) - self.data_loader = DataLoader(dataset_dir=self.dataset_dir, - label_transformer=label_transformer, - logger=logger, - data_url=data_url, - config=config) + self.dataset_dir = os.path.join(self.tmp_dir, f"dataset_{url_hash}") + self.data_loader = DataLoader(dataset_dir=self.dataset_dir, label_transformer=label_transformer, logger=logger, data_url=data_url, config=config) self.path_scores = path_scores self.feature_extractor = feature_extractor self.class_dict = class_dict @@ -172,15 +161,14 @@ def __get_labels(self, data: List[List[LineWithLabel]]) -> List[str]: result = [line.label for line in flatten(data)] return result - def get_features_with_separing(self, data: List[List[LineWithLabel]]) \ - -> [pd.DataFrame, pd.DataFrame, List[str], 
List[str]]:
+    def get_features_with_separing(self, data: List[List[LineWithLabel]]) -> [pd.DataFrame, pd.DataFrame, List[str], List[str]]:
         features = self.feature_extractor.fit_transform(data)
 
         n = features.shape[0] // 10
         features_train, features_test = features[n:], features[:n]
         labels = self.__get_labels(data)
         labels_train, labels_test = labels[n:], labels[:n]
-        self.logger.info("data train shape {}".format(features_train.shape))
-        self.logger.info("data test shape {}".format(features_test.shape))
+        self.logger.info(f"data train shape {features_train.shape}")
+        self.logger.info(f"data test shape {features_test.shape}")
 
         return features_train, features_test, labels_train, labels_test
@@ -206,23 +194,19 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti
             loader_iter = iter(LineEpsDataSet(features_train, labels_train, self.class_dict))
 
             time_begin = time.time()
-            train_loss, train_acc = self.train(lstm_model, loader_iter, len(labels_train),
-                                               optimizer, criteria, batch_size=self.batch_size)
+            train_loss, train_acc = self.train(lstm_model, loader_iter, len(labels_train), optimizer, criteria, batch_size=self.batch_size)
             time_epoch += time.time() - time_begin
-            print("\n\t \x1b\33[33mTrain: epoch: {}| Train loss: {} | Train acc: {}\x1b[0m".format(epoch, train_loss,  # noqa
-                                                                                                   train_acc))
+            print(f"\n\t \x1b[33mTrain: epoch: {epoch}| Train loss: {train_loss} | Train acc: {train_acc}\x1b[0m")  # noqa
             if file_log:
-                file_log.write("\t Train: epoch: {}| Train loss: {} | Train acc: {}\n".format(epoch, epoch, train_loss))
+                file_log.write(f"\t Train: epoch: {epoch}| Train loss: {train_loss} | Train acc: {train_acc}\n")
 
             # Evaluation
             if with_eval:
                 loader_iter = iter(LineEpsDataSet(features_test, labels_test, self.class_dict))
-                test_loss, test_acc = self.evaluate(lstm_model, loader_iter, len(labels_test), criteria,
-                                                    batch_size=self.batch_size)
-                print("\n\t \x1b\33[92mEvaluation: Test loss: {} | Test acc: {}\x1b[0m".format(test_loss, test_acc))  # noqa
+                test_loss, test_acc = self.evaluate(lstm_model, loader_iter, len(labels_test), criteria, batch_size=self.batch_size)
+                print(f"\n\t \x1b[92mEvaluation: Test loss: {test_loss} | Test acc: {test_acc}\x1b[0m")  # noqa
                 if file_log:
-                    file_log.write(
-                        "\t Eval: epoch: {}| Test loss: {} | Test acc: {}\n".format(epoch, test_loss, test_acc))
+                    file_log.write(f"\t Eval: epoch: {epoch}| Test loss: {test_loss} | Test acc: {test_acc}\n")
                 curr_loss = test_loss
                 res_acc += test_acc
                 res_loss += test_loss
@@ -235,7 +219,7 @@ def training_and_evaluation_process(self, lstm_model: nn.Module, optimizer: Opti
 
             if with_save and curr_loss < best_loss:
                 best_loss = curr_loss
                 torch.save(lstm_model.state_dict(), self.path_out)
-                print("Model has been saved into {}".format(self.path_out))  # noqa
+                print(f"Model has been saved into {self.path_out}")  # noqa
 
         return res_loss / self.num_epochs, res_acc / self.num_epochs, time_epoch / self.num_epochs
@@ -265,16 +249,15 @@ def fit(self, with_cross_val: bool = True) -> None:
                 lstm_model = LSTM(input_dim=features_train.shape[1], hidden_dim=features_train.shape[1],
                                   hidden_dim_2=lstm_hidden_dim, num_classes=self.num_classes, lstm_layers=lstm_layers,
-                                  bidirectional=self.bi_directional, dropout=lstm_drop_out, device=self.device) \
-                    .to(self.device)
+                                  bidirectional=self.bi_directional, dropout=lstm_drop_out, device=self.device).to(self.device)
 
                 optimizer = torch.optim.Adam(lstm_model.parameters(), lr=self.lr)
-                logfile_kfold_tmp.write("\t KFold iter = {}\n".format(iteration))
+                logfile_kfold_tmp.write(f"\t KFold iter
= {iteration}\n") loss, acc, epoch_sec = self.training_and_evaluation_process(lstm_model, optimizer, criteria, features_train, labels_train, features_test, labels_test, file_log=logfile_kfold_tmp, with_save=False, with_eval=True) - logfile_kfold_tmp.write("\t time_epoch_(sec)={}\n".format(epoch_sec)) + logfile_kfold_tmp.write(f"\t time_epoch_(sec)={epoch_sec}\n") logfile_kfold_tmp.flush() scores.append(acc) epoch_time.append(epoch_sec) @@ -299,7 +282,7 @@ def fit(self, with_cross_val: bool = True) -> None: scores_dict["final_accuracy"] = acc if self.path_scores is not None: - self.logger.info("Save scores in {}".format(self.path_scores)) + self.logger.info(f"Save scores in {self.path_scores}") os.makedirs(os.path.basename(self.path_scores), exist_ok=True) with open(self.path_scores, "w") as file: json.dump(obj=scores_dict, fp=file, indent=4) @@ -322,7 +305,7 @@ def _evaluate_batch_num(self, cnt_data: int, batch_size: int) -> [int, int]: def _get_batch_data(self, curr_batch_size: int, iterator: Iterator) -> [List[Iterator], List[int], List[str]]: batch_features, batch_lens, labels = [], [], [] - for num in range(curr_batch_size): + for _ in range(curr_batch_size): curr = next(iterator) batch_features.append(curr[0]) batch_lens.append(curr[2]) @@ -330,8 +313,7 @@ def _get_batch_data(self, curr_batch_size: int, iterator: Iterator) -> [List[Ite return batch_features, batch_lens, labels - def train(self, model: nn.Module, iterator: Iterator, cnt_data: int, optimizer: Optimizer, - criteria: CrossEntropyLoss, batch_size: int) -> [float, float]: + def train(self, model: nn.Module, iterator: Iterator, cnt_data: int, optimizer: Optimizer, criteria: CrossEntropyLoss, batch_size: int) -> [float, float]: epoch_loss = 0 epoch_acc = 0 cnt = 0 @@ -358,12 +340,11 @@ def train(self, model: nn.Module, iterator: Iterator, cnt_data: int, optimizer: epoch_acc += accuracy cnt += 1 if log_per_cnt != 0 and batch_num % log_per_cnt == 0: - print("\t\tbatch_num: {}, loss={}, acc={}".format(batch_num, epoch_loss / cnt, epoch_acc / cnt)) # noqa + print(f"\t\tbatch_num: {batch_num}, loss={epoch_loss / cnt}, acc={epoch_acc / cnt}") # noqa return epoch_loss / cnt, epoch_acc / cnt - def evaluate(self, model: nn.Module, iterator: Iterator, cnt_data: int, criteria: CrossEntropyLoss, - batch_size: int) -> [float, float]: + def evaluate(self, model: nn.Module, iterator: Iterator, cnt_data: int, criteria: CrossEntropyLoss, batch_size: int) -> [float, float]: epoch_loss = 0 epoch_acc = 0 cnt = 0 diff --git a/dedoc/train_dataset/trainer/logreg_line_classifier_trainer.py b/dedoc/train_dataset/trainer/logreg_line_classifier_trainer.py index 65facba3..a98bc51d 100644 --- a/dedoc/train_dataset/trainer/logreg_line_classifier_trainer.py +++ b/dedoc/train_dataset/trainer/logreg_line_classifier_trainer.py @@ -1,5 +1,6 @@ import logging -from typing import Optional, Callable +from typing import Callable, Optional + from sklearn.linear_model import LogisticRegression from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor diff --git a/dedoc/train_dataset/trainer/xgboost_line_classifier_trainer.py b/dedoc/train_dataset/trainer/xgboost_line_classifier_trainer.py index fdf1de74..4ba65d22 100644 --- a/dedoc/train_dataset/trainer/xgboost_line_classifier_trainer.py +++ b/dedoc/train_dataset/trainer/xgboost_line_classifier_trainer.py @@ -1,5 +1,6 @@ import logging -from typing import Optional, List, Callable +from typing import Callable, List, Optional + import xgbfir from xgboost import XGBClassifier 
@@ -34,5 +35,4 @@ def _get_classifier(self) -> XGBClassifier:
         return XGBClassifier(random_state=self.random_seed, **self.classifier_parameters)
 
     def _save_features_importances(self, cls: XGBClassifier, feature_names: List[str]) -> None:
-        xgbfir.saveXgbFI(cls, feature_names=feature_names,
-                         OutputXlsxFile=self.path_features_importances)
+        xgbfir.saveXgbFI(cls, feature_names=feature_names, OutputXlsxFile=self.path_features_importances)
diff --git a/dedoc/utils/annotation_merger.py b/dedoc/utils/annotation_merger.py
index c9bdee84..cc04d31e 100644
--- a/dedoc/utils/annotation_merger.py
+++ b/dedoc/utils/annotation_merger.py
@@ -1,6 +1,6 @@
 import re
 from collections import defaultdict
-from typing import List, Dict, Union, Optional
+from typing import Dict, List, Optional, Union
 
 from dedoc.data_structures.annotation import Annotation
 
@@ -62,7 +62,7 @@ def merge(self) -> Optional[Annotation]:
 
 
 class AnnotationMerger:
-    spaces = re.compile(r'\s+')
+    spaces = re.compile(r"\s+")
 
     def merge_annotations(self, annotations: List[Annotation], text: str) -> List[Annotation]:
         """
diff --git a/dedoc/utils/image_utils.py b/dedoc/utils/image_utils.py
index 699cde86..1153a47c 100644
--- a/dedoc/utils/image_utils.py
+++ b/dedoc/utils/image_utils.py
@@ -1,10 +1,11 @@
-from typing import Tuple, List
+from copy import deepcopy
+from typing import List, Tuple
+
 import PIL
 import cv2
 import numpy as np
-from scipy.ndimage import maximum_filter
-from copy import deepcopy
 from PIL import Image, ImageDraw
+from scipy.ndimage import maximum_filter
 
 from dedoc.data_structures.bbox import BBox
 
@@ -27,16 +28,12 @@ def get_bbox_from_image(image: Image, bbox: BBox, resize: Tuple[int, int] = (300
     return image as is
     @return: PIL image
     """
-    rectangle = (bbox.x_top_left,
-                 bbox.y_top_left,
-                 bbox.x_bottom_right,
-                 bbox.y_bottom_right
-                 )
+    rectangle = (bbox.x_top_left, bbox.y_top_left, bbox.x_bottom_right, bbox.y_bottom_right)
     if isinstance(image, np.ndarray):
         image = PIL.Image.fromarray(image)
     cropped = image.crop(rectangle)
     if resize is not None:
-        cropped = cropped.resize((300, 15)).convert('RGB')
+        cropped = cropped.resize(resize).convert("RGB")
     return cropped
 
@@ -57,9 +54,7 @@ def rotate_image(image: np.ndarray, angle: float, color_bound: Tuple[int, int, i
     rotation_mat[0, 2] += bound_w / 2 - image_center[0]
     rotation_mat[1, 2] += bound_h / 2 - image_center[1]
 
-    rotated_mat = cv2.warpAffine(image, rotation_mat, (bound_w, bound_h),
-                                 borderMode=cv2.BORDER_CONSTANT,
-                                 borderValue=color_bound)
+    rotated_mat = cv2.warpAffine(image, rotation_mat, (bound_w, bound_h), borderMode=cv2.BORDER_CONSTANT, borderValue=color_bound)
     return rotated_mat
 
@@ -86,12 +81,7 @@ def crop_image_text(image: np.ndarray) -> np.ndarray:
     return image_crop
 
 
-def draw_rectangle(image: PIL.Image,
-                   x_top_left: int,
-                   y_top_left: int,
-                   width: int,
-                   height: int,
-                   color: Tuple[int, int, int] = (0, 0, 0)) -> np.ndarray:
+def draw_rectangle(image: PIL.Image, x_top_left: int, y_top_left: int, width: int, height: int, color: Tuple[int, int, int] = (0, 0, 0)) -> np.ndarray:
     if color == "black":
         color = (0, 0, 0)
     source_img = deepcopy(image).convert("RGBA")
@@ -111,7 +101,7 @@ def get_concat_v(images: List[Image.Image]) -> Image:
         return images[0]
     width = max((image.width for image in images))
     height = sum((image.height for image in images))
-    dst = Image.new('RGB', (width, height))
+    dst = Image.new("RGB", (width, height))
     height = 0
     for image in images:
         dst.paste(image, (0, height))
diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py
index 1e623359..e15d0015 100644
--- a/dedoc/utils/parameter_utils.py
+++ b/dedoc/utils/parameter_utils.py
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any, Tuple
+from typing import Any, Dict, Optional, Tuple
 
 
 def get_param_language(parameters: Optional[dict]) -> str:
@@ -122,4 +122,4 @@ def get_param_page_slice(parameters: Dict[str, Any]) -> Tuple[Optional[int], Opt
         return first_page, last_page
     except Exception:
-        raise ValueError("Error input parameter 'pages'. Bad page limit {}".format(pages))
+        raise ValueError(f"Error input parameter 'pages'. Bad page limit {pages}")
diff --git a/dedoc/utils/pdf_utils.py b/dedoc/utils/pdf_utils.py
index c93884a9..68bfa9a6 100644
--- a/dedoc/utils/pdf_utils.py
+++ b/dedoc/utils/pdf_utils.py
@@ -7,7 +7,7 @@ def get_pdf_page_count(path: str) -> Optional[int]:
     try:
-        with open(path, 'rb') as fl:
+        with open(path, "rb") as fl:
             reader = PdfFileReader(fl)
             return reader.getNumPages()
     except Exception:
diff --git a/dedoc/utils/utils.py b/dedoc/utils/utils.py
index e9f4b9d5..c8f74605 100644
--- a/dedoc/utils/utils.py
+++ b/dedoc/utils/utils.py
@@ -10,7 +10,7 @@
 import shutil
 import time
 from os.path import splitext
-from typing import List, Optional, TypeVar, Tuple, Iterable, Iterator, Dict, Any
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar
 
 import requests
 from Levenshtein._levenshtein import ratio
@@ -44,7 +44,7 @@ def identity(x: T) -> T:
 
 def get_batch(size: int, iterable: Iterator[T]) -> Iterator[List[T]]:
     """
-    it is batch generator. Generating batch with 'size'. Last batch can be less then size or equals []
+    It is a batch generator yielding batches of length "size". The last batch can be shorter than "size" or empty.
     :param size: batch size
     :param iterable: input data iterator
     :return: iterator of element of current batch
@@ -63,8 +63,8 @@ def splitext_(path: str) -> Tuple[str, str]:
     """
     get extensions with several dots
     """
-    if len(path.split('.')) > 2:
-        return path.split('.')[0], '.' + '.'.join(path.split('.')[-2:])
+    if len(path.split(".")) > 2:
+        return path.split(".")[0], "." + ".".join(path.split(".")[-2:])
     return splitext(path)
 
 
@@ -90,7 +90,7 @@ def get_unique_name(filename: str) -> str:
     _, ext = splitext_(filename)
     ts = int(time.time())
     rnd = random.randint(0, 1000)
-    return str(ts) + '_' + str(rnd) + ext
+    return str(ts) + "_" + str(rnd) + ext
 
 
 def save_upload_file(upload_file: UploadFile, output_dir: str) -> str:
@@ -122,7 +122,7 @@ def save_data_to_unique_file(directory: str, filename: str, binary_data: bytes)
 
 
 def get_file_mime_type(path: str) -> str:
-    return mimetypes.guess_type(path)[0] or 'application/octet-stream'
+    return mimetypes.guess_type(path)[0] or "application/octet-stream"
 
 
 def get_extensions_by_mime(mime: str) -> List[str]:
@@ -136,7 +136,7 @@ def get_extensions_by_mimes(mimes: List[str]) -> List[str]:
     return exts
 
 
-def special_match(strg: str, regular_pattern: str = r'[^.?!,:;"\'\n\r ]') -> bool:
+def special_match(strg: str, regular_pattern: str = r"[^.?!,:;'\"\n\r ]") -> bool:
     """
     checks if a string only contains certain characters
     """
@@ -231,12 +231,12 @@ def check_filename_length(filename: str) -> str:
 
 
 def send_file(host: str, file_name: str, file_path: str, parameters: dict) -> Dict[str, Any]:
-    with open(file_path, 'rb') as file:
+    with open(file_path, "rb") as file:
         # file we want to parse
-        files = {'file': (file_name, file)}
+        files = {"file": (file_name, file)}
         # dict with additional parameters
         # and now we send post request with attached file and parameters.
- r = requests.post("{}/upload".format(host), files=files, data=parameters) + r = requests.post(f"{host}/upload", files=files, data=parameters) # wait for response, parse json result and print it assert r.status_code == 200 result = json.loads(r.content.decode()) diff --git a/pyproject.toml b/pyproject.toml index be2e0159..5cadd5ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,19 @@ docs = [ "sphinx_rtd_theme==1.2.0", # for using sphinx_rtd_theme "linuxdoc==20230506" # for using flat-table ] +lint = [ + "flake8==5.0.4", + "flake8-absolute-import==1.0.0.1", + "flake8-annotations==2.9.1", + "flake8-bugbear==23.3.12", + "flake8-builtins==2.1.0", + "flake8-import-order==0.18.2", + "flake8-print==5.0.0", + "flake8-quotes==3.3.2", + "flake8-use-fstring==1.4", + "pycodestyle==2.9.0", + "pep8-naming==0.13.3" +] [project.scripts] dedoc = "dedoc.main:main" diff --git a/requirements.txt b/requirements.txt index 2f4ace74..1148f6b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ Cython==0.29.28 -flake8==3.9.2 -pyflakes==2.3.0 beautifulsoup4==4.10.0 +catboost==1.2 charset-normalizer==2.0.12 docx==0.2.4 huggingface-hub==0.14.1 diff --git a/tests/api_tests/abstract_api_test.py b/tests/api_tests/abstract_api_test.py index 077750be..579f26ea 100644 --- a/tests/api_tests/abstract_api_test.py +++ b/tests/api_tests/abstract_api_test.py @@ -1,6 +1,8 @@ import json import os + import requests + from dedoc.utils.utils import similarity as utils_similarity from tests.api_tests.content_checker import ContentChecker from tests.test_utils import tree2linear @@ -14,15 +16,15 @@ def _check_similarity(self, actual: str, expected: str, threshold: float = 0.8) self.assertEqual(expected, actual) def _check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None: - self.assertEqual(metainfo['file_type'], actual_type) - self.assertEqual(metainfo['file_name'], actual_name) + self.assertEqual(metainfo["file_type"], actual_type) + self.assertEqual(metainfo["file_name"], actual_name) def _get_host(self) -> str: - host = os.environ.get('DOC_READER_HOST', 'localhost') + host = os.environ.get("DOC_READER_HOST", "localhost") return host def _get_port(self) -> int: - port = int(os.environ.get('DOCREADER_PORT', '1231')) + port = int(os.environ.get("DOCREADER_PORT", "1231")) return port def _get_abs_path(self, file_name: str) -> str: @@ -45,9 +47,9 @@ def _send_request(self, file_name: str, data: dict = None, expected_code: int = port = self._get_port() abs_path = self._get_abs_path(file_name) - with open(abs_path, 'rb') as file: - files = {'file': (file_name, file)} - r = requests.post("http://{host}:{port}/upload".format(host=host, port=port), files=files, data=data) + with open(abs_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post(f"http://{host}:{port}/upload", files=files, data=data) self.assertEqual(expected_code, r.status_code) if expected_code != 200: return r.content.decode() @@ -63,7 +65,7 @@ def _send_request_wo_file(self, data: dict = None, expected_code: int = 200) -> if data is None: data = {} - r = requests.post("http://{host}:{port}/upload".format(host=host, port=port), data=data) + r = requests.post(f"http://{host}:{port}/upload", data=data) self.assertEqual(expected_code, r.status_code) if expected_code != 200: @@ -74,7 +76,7 @@ def _send_request_wo_file(self, data: dict = None, expected_code: int = 200) -> def _test_table_refs(self, content: dict) -> None: tree = content["structure"] - tables = content['tables'] + tables = 
content["tables"] lines = tree2linear(tree) annotations = [] for line in lines: diff --git a/tests/api_tests/content_checker.py b/tests/api_tests/content_checker.py index 025397ee..10741aed 100644 --- a/tests/api_tests/content_checker.py +++ b/tests/api_tests/content_checker.py @@ -84,8 +84,7 @@ def _check_english_doc(self, result: dict) -> None: self.assertEqual("THE GREAT ENGLISH DOCUMENT", structure["subparagraphs"][0]["text"].strip()) list_elements = structure["subparagraphs"][1]["subparagraphs"] self.assertEqual("1) Fisrst item with some english text", list_elements[0]["text"].strip()) - self.assertEqual("2) Second item with some even more inglish text. Let me speek from my heart", - list_elements[1]["text"].strip()) + self.assertEqual("2) Second item with some even more inglish text. Let me speek from my heart", list_elements[1]["text"].strip()) table = content["tables"][0] - self.assertListEqual(['London', 'The capital of Great Britain'], table["cells"][0]) - self.assertListEqual(['Speek', 'From my heart'], table["cells"][1]) + self.assertListEqual(["London", "The capital of Great Britain"], table["cells"][0]) + self.assertListEqual(["Speek", "From my heart"], table["cells"][1]) diff --git a/tests/api_tests/test_api_diploma.py b/tests/api_tests/test_api_diploma.py new file mode 100644 index 00000000..374860e2 --- /dev/null +++ b/tests/api_tests/test_api_diploma.py @@ -0,0 +1,91 @@ +import os + +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader + + +class TestApiDiploma(AbstractTestApiDocReader): + + def _get_abs_path(self, file_name: str) -> str: + return os.path.join(self.data_directory_path, "diplomas", file_name) + + def test_diploma_pdf(self) -> None: + file_name = "diploma.pdf" + result = self._send_request(file_name, dict(document_type="diploma", pdf_with_text_layer="tabby")) + structure = result["content"]["structure"] + + node = self._get_by_tree_path(structure, "0") + self.assertEqual("Москва, 2021 г.", node["text"].strip()[-15:]) + self.assertEqual("root", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.0") + self.assertEqual("", node["text"]) + self.assertEqual("body", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.1") + self.assertEqual("СОДЕРЖАНИЕ", node["text"].strip()) + self.assertEqual("toc", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.1.0") + self.assertEqual("ВВЕДЕНИЕ", node["text"][:8]) + self.assertEqual("toc_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.2") + self.assertEqual("ВВЕДЕНИЕ", node["text"].strip()) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.3") + self.assertEqual("1. ТЕОРЕТИЧЕСКОЕ", node["text"][:16]) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.4") + self.assertEqual("2. 
АНАЛИЗ", node["text"][:9]) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.5") + self.assertEqual("ЗАКЛЮЧЕНИЕ", node["text"].strip()) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.6") + self.assertEqual("БИБЛИОГРАФИЧЕСКИЙ СПИСОК", node["text"].strip()) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + def test_diploma_docx(self) -> None: + file_name = "diploma.docx" + result = self._send_request(file_name, dict(document_type="diploma")) + structure = result["content"]["structure"] + + node = self._get_by_tree_path(structure, "0") + self.assertEqual("Москва 2023 г.", node["text"].strip()[-14:]) + self.assertEqual("root", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.0") + self.assertEqual("", node["text"]) + self.assertEqual("body", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.1") + self.assertEqual("Содержание", node["text"].strip()) + self.assertEqual("toc", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.1.0") + self.assertEqual("Введение", node["text"][:8]) + self.assertEqual("toc_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.2") + self.assertEqual("Введение", node["text"].strip()) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.3") + self.assertEqual("Глава 1.", node["text"][:8]) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.4") + self.assertEqual("Глава 2.", node["text"][:8]) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.5") + self.assertEqual("Глава 3.", node["text"][:8]) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + + node = self._get_by_tree_path(structure, "0.6") + self.assertEqual("Список литературы", node["text"].strip()) + self.assertEqual("named_item", node["metadata"]["paragraph_type"]) diff --git a/tests/api_tests/test_api_doctype_law.py b/tests/api_tests/test_api_doctype_law.py index 6fe3079c..558d130a 100644 --- a/tests/api_tests/test_api_doctype_law.py +++ b/tests/api_tests/test_api_doctype_law.py @@ -1,10 +1,9 @@ import os import unittest from collections import Counter -from typing import List, Dict +from typing import Dict, List from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation - from tests.api_tests.abstract_api_test import AbstractTestApiDocReader from tests.test_utils import tree2linear @@ -24,8 +23,7 @@ def test_law_txt(self) -> None: self.assertIn("ЗАКОН", structure["text"]) self.assertEqual(0, structure["metadata"]["line_id"]) self.assertEqual("root", structure["metadata"]["paragraph_type"]) - self.assertEqual("Статья 1.1. Законодательство города Москвы об административных", - body["subparagraphs"][0]["text"].split("\n")[0].strip()) + self.assertEqual("Статья 1.1. 
Законодательство города Москвы об административных", body["subparagraphs"][0]["text"].split("\n")[0].strip()) self.assertTrue(body["subparagraphs"][2]["text"].strip().startswith("Статья")) def test_law_html(self) -> None: @@ -36,7 +34,7 @@ def test_law_html(self) -> None: self.__test_law_tree_sanity(document_tree) body = self._get_body(document_tree) self.assertEqual("articlePart", body["subparagraphs"][0]["metadata"]["paragraph_type"]) - self.assertIn('У К А З', document_tree["text"]) + self.assertIn("У К А З", document_tree["text"]) def test_law_image(self) -> None: file_name = "law_image.png" @@ -117,47 +115,44 @@ def test_law_article_multiline(self) -> None: self.__test_law_tree_sanity(document_tree) article = self._get_by_tree_path(document_tree, "0.0.0") - self.assertEqual('Статья 20.1. Представление сведений о расходах', article["text"].strip()) - self.assertEqual('article', article["metadata"]["paragraph_type"]) + self.assertEqual("Статья 20.1. Представление сведений о расходах", article["text"].strip()) + self.assertEqual("article", article["metadata"]["paragraph_type"]) item = self._get_by_tree_path(document_tree, "0.0.0.1") - self.assertEqual('1.', item["text"].strip()) - self.assertEqual('articlePart', item["metadata"]["paragraph_type"]) + self.assertEqual("1.", item["text"].strip()) + self.assertEqual("articlePart", item["metadata"]["paragraph_type"]) item = self._get_by_tree_path(document_tree, "0.0.0.2") - self.assertEqual('2.', item["text"].strip()) - self.assertEqual('articlePart', item["metadata"]["paragraph_type"]) + self.assertEqual("2.", item["text"].strip()) + self.assertEqual("articlePart", item["metadata"]["paragraph_type"]) item = self._get_by_tree_path(document_tree, "0.0.0.3") - self.assertEqual('3.', item["text"].strip()) - self.assertEqual('articlePart', item["metadata"]["paragraph_type"]) + self.assertEqual("3.", item["text"].strip()) + self.assertEqual("articlePart", item["metadata"]["paragraph_type"]) article = self._get_by_tree_path(document_tree, "0.0.1") - self.assertEqual('Статья 20.2. Представление сведений о размещении информации в\n' + self.assertEqual("Статья 20.2. Представление сведений о размещении информации в\n" 'информационно-телекоммуникационной сети "Интернет"', article["text"].strip()) - self.assertEqual('article', article["metadata"]["paragraph_type"]) + self.assertEqual("article", article["metadata"]["paragraph_type"]) item = self._get_by_tree_path(document_tree, "0.0.1.1") - self.assertEqual('1.', item["text"].strip()) - self.assertEqual('articlePart', item["metadata"]["paragraph_type"]) + self.assertEqual("1.", item["text"].strip()) + self.assertEqual("articlePart", item["metadata"]["paragraph_type"]) def test_law_pdf_uc(self) -> None: file_name = "ukodeksrf.pdf" result = self._send_request(file_name, dict(document_type="law"), expected_code=200) document = result["content"]["structure"] - self.assertEqual("Уголовный кодекс Российской Федерации от 13 июня 1996 г. М 63-ФЗ", - document["text"].split("\n")[0].strip()) + self.assertEqual("Уголовный кодекс Российской Федерации от 13 июня 1996 г. М 63-ФЗ", document["text"].split("\n")[0].strip()) section = self._get_by_tree_path(document, "0.0.0") self.assertEqual("Раздел I. Уголовный закон", section["text"].strip()) self.assertEqual("section", section["metadata"]["paragraph_type"]) subsection = self._get_by_tree_path(document, "0.0.0.0") - self.assertEqual("Глава 1. Задачи и принципы Уголовного кодекса Российской Федерации", - subsection["text"].strip()) + self.assertEqual("Глава 1. 
Задачи и принципы Уголовного кодекса Российской Федерации", subsection["text"].strip()) self.assertEqual("chapter", subsection["metadata"]["paragraph_type"]) article = self._get_by_tree_path(document, "0.0.0.0.0") - self.assertEqual("Статья 1. Уголовное законодательство Российской Федерации", - article["text"].strip()) + self.assertEqual("Статья 1. Уголовное законодательство Российской Федерации", article["text"].strip()) self.assertEqual("article", article["metadata"]["paragraph_type"]) def test_law_pdf_with_applications(self) -> None: @@ -167,8 +162,8 @@ def test_law_pdf_with_applications(self) -> None: document_tree = result["content"]["structure"] self.__test_law_tree_sanity(document_tree) - self.assertIn('Утвержден\n', self._get_by_tree_path(document_tree, "0.1")['text']) - self.assertIn('Приложение\n', self._get_by_tree_path(document_tree, "0.2")['text']) + self.assertIn("Утвержден\n", self._get_by_tree_path(document_tree, "0.1")["text"]) + self.assertIn("Приложение\n", self._get_by_tree_path(document_tree, "0.2")["text"]) def test_chapter(self) -> None: # фстэк 17 это приказ, и в нем I. Общие положения трактуется как глава @@ -225,23 +220,23 @@ def test_law_paragraphs(self) -> None: apps = self._get_applications(document_tree) self.assertEqual(5, len(apps)) node = self._get_by_tree_path(document_tree, "0.1") - self.assertTrue(node['text'].strip().startswith("Приложение N 1")) - self.assertTrue(node['text'].strip().endswith("Российской Федерации")) - self.assertEqual(node['metadata']['paragraph_type'], 'application') + self.assertTrue(node["text"].strip().startswith("Приложение N 1")) + self.assertTrue(node["text"].strip().endswith("Российской Федерации")) + self.assertEqual(node["metadata"]["paragraph_type"], "application") node = self._get_by_tree_path(document_tree, "0.2") - self.assertTrue(node['text'].strip().startswith("Приложение N 2")) - self.assertIn("Департамента управления делами", node['text']) - self.assertTrue(node['text'].strip().endswith("____________")) - self.assertEqual(node['metadata']['paragraph_type'], 'application') + self.assertTrue(node["text"].strip().startswith("Приложение N 2")) + self.assertIn("Департамента управления делами", node["text"]) + self.assertTrue(node["text"].strip().endswith("____________")) + self.assertEqual(node["metadata"]["paragraph_type"], "application") node = self._get_by_tree_path(document_tree, "0.3") - self.assertTrue('Приложение N 3' in node['text']) - self.assertEqual(node['metadata']['paragraph_type'], 'application') + self.assertTrue("Приложение N 3" in node["text"]) + self.assertEqual(node["metadata"]["paragraph_type"], "application") node = self._get_by_tree_path(document_tree, "0.4") - self.assertTrue('Приложение N 4' in node['text']) - self.assertEqual(node['metadata']['paragraph_type'], 'application') + self.assertTrue("Приложение N 4" in node["text"]) + self.assertEqual(node["metadata"]["paragraph_type"], "application") node = self._get_by_tree_path(document_tree, "0.5") - self.assertTrue('Приложение N 5' in node['text']) - self.assertEqual(node['metadata']['paragraph_type'], 'application') + self.assertTrue("Приложение N 5" in node["text"]) + self.assertEqual(node["metadata"]["paragraph_type"], "application") expected = "Отдел мобилизационной подготовки:" node = self._get_by_tree_path(document_tree, "0.3.0.6.0") @@ -251,31 +246,31 @@ def test_law_paragraphs(self) -> None: node = self._get_by_tree_path(document_tree, "0.3.0.6.1") self.assertEqual(expected, node["text"].strip()[0: len(expected)]) - expected = 'I. 
Общие положения' + expected = "I. Общие положения" node = self._get_by_tree_path(document_tree, "0.1.0") self.assertEqual(expected, node["text"].strip()[0: len(expected)]) self.assertEqual("chapter", node["metadata"]["paragraph_type"]) - expected = 'II. Процедуры, направленные на выявление и предотвращение\n' \ - ' нарушений законодательства Российской Федерации\n ' \ - ' в сфере персональных данных' + expected = "II. Процедуры, направленные на выявление и предотвращение\n" \ + " нарушений законодательства Российской Федерации\n " \ + " в сфере персональных данных" node = self._get_by_tree_path(document_tree, "0.1.1") self.assertEqual(expected, node["text"].strip()[0: len(expected)]) self.assertEqual("chapter", node["metadata"]["paragraph_type"]) - expected = 'III. Цели обработки персональных данных, содержание\n' \ - ' обрабатываемых персональных данных, категории субъектов,' \ - '\n персональные данные которых обрабатываются' + expected = "III. Цели обработки персональных данных, содержание\n" \ + " обрабатываемых персональных данных, категории субъектов," \ + "\n персональные данные которых обрабатываются" node = self._get_by_tree_path(document_tree, "0.1.2") self.assertEqual(expected, node["text"].strip()[0: len(expected)]) self.assertEqual("chapter", node["metadata"]["paragraph_type"]) - expected = 'IV. Содержание обрабатываемых персональных данных,\n' \ - ' категории субъектов, персональные данные которых обрабатываются,\n' \ - ' сроки их обработки и хранения, порядок уничтожения\n' \ - ' при достижении целей обработки или при наступлении\n' \ - ' иных законных оснований, определенные для каждой цели\n' \ - ' обработки персональных данных' + expected = "IV. Содержание обрабатываемых персональных данных,\n" \ + " категории субъектов, персональные данные которых обрабатываются,\n" \ + " сроки их обработки и хранения, порядок уничтожения\n" \ + " при достижении целей обработки или при наступлении\n" \ + " иных законных оснований, определенные для каждой цели\n" \ + " обработки персональных данных" node = self._get_by_tree_path(document_tree, "0.1.3") self.assertEqual(expected, node["text"].strip()[0: len(expected)]) self.assertEqual("chapter", node["metadata"]["paragraph_type"]) @@ -291,37 +286,36 @@ def test_foiv_txt(self) -> None: self.assertTrue("Приложение" in self._get_by_tree_path(document_tree, "0.1")["text"].strip()) node = self._get_by_tree_path(document_tree, "0.1.7") self.assertEqual("8.", node["text"].strip()) - self.assertEqual("item", node['metadata']['paragraph_type']) + self.assertEqual("item", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.1.7.4") self.assertEqual("8.4.", node["text"].strip()) - self.assertEqual("item", node['metadata']['paragraph_type']) + self.assertEqual("item", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.1.7.4.1") self.assertEqual("1)", node["text"].strip()) - self.assertEqual("subitem", node['metadata']['paragraph_type']) + self.assertEqual("subitem", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.1.7.4.1.1") self.assertEqual("а)", node["text"].strip()) - self.assertEqual("subitem", node['metadata']['paragraph_type']) + self.assertEqual("subitem", node["metadata"]["paragraph_type"]) def test_application_txt(self) -> None: file_name = "law_application.txt" result = self._send_request(file_name, dict(document_type="law"), expected_code=200) document_tree = result["content"]["structure"] node = self._get_by_tree_path(document_tree, "0.1") 
- self.assertTrue(node["text"].strip().startswith( - "УТВЕРЖДЕНЫ\n\nпостановлением Правительства\n\nРоссийской Федерации")) - self.assertEqual("application", node['metadata']['paragraph_type']) + self.assertTrue(node["text"].strip().startswith("УТВЕРЖДЕНЫ\n\nпостановлением Правительства\n\nРоссийской Федерации")) + self.assertEqual("application", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.1.0") self.assertEqual(node["text"].strip(), "I. Общие положения") - self.assertEqual("subsection", node['metadata']['paragraph_type']) + self.assertEqual("subsection", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.1.0.1") self.assertEqual(node["text"].strip(), "1.") - self.assertEqual("articlePart", node['metadata']['paragraph_type']) + self.assertEqual("articlePart", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.1.4.1.6") self.assertEqual(node["text"].strip(), "е)") - self.assertEqual("subitem", node['metadata']['paragraph_type']) + self.assertEqual("subitem", node["metadata"]["paragraph_type"]) @unittest.skip("TODO fix this") def test_number_not_part(self) -> None: @@ -331,7 +325,7 @@ def test_number_not_part(self) -> None: self.__test_law_tree_sanity(document_tree) node = self._get_by_tree_path(document_tree, "0.0.3.5.0.0") self.assertTrue(node["text"].strip().endswith("2 настоящей статьи.")) - self.assertEqual("raw_text", node['metadata']['paragraph_type']) + self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) def test_application_multiline(self) -> None: file_name = "13.txt" @@ -368,9 +362,7 @@ def test_application_multiline(self) -> None: @unittest.skip("TODO fix this") def test_auto_paragraph(self) -> None: file_name = "fstec_1_cut.pdf" - result = self._send_request(file_name, dict(document_type="law", - pdf_with_text_layer="false", - is_one_column_document="auto")) + result = self._send_request(file_name, dict(document_type="law", pdf_with_text_layer="false", is_one_column_document="auto")) tree = result["content"]["structure"] self.__test_law_tree_sanity(tree) warnings = result["warnings"] @@ -489,8 +481,7 @@ def test_law_with_super_elements(self) -> None: # source html-document had "Статья 16^1" node = self._get_by_tree_path(document_tree, "0.0.0.0.15") - self.assertEqual("Статья 161. Переход к рассмотрению дела по правилам гражданского судопроизводства", - node["text"].strip()) + self.assertEqual("Статья 161. 
Переход к рассмотрению дела по правилам гражданского судопроизводства", node["text"].strip()) self.assertEqual("article", node["metadata"]["paragraph_type"]) def test_law_html_with_table(self) -> None: @@ -504,14 +495,9 @@ def test_law_html_with_table(self) -> None: self.assertEqual(1, len(tables)) table = tables[0]["cells"] self.assertListEqual(["№\nп/п", "", "Ф.И.О.", "Должность"], list(map(str.strip, table[0]))) - self.assertListEqual(["1", - "Председатель\nкомиссии", - "Городецкий \n\nЯрослав Иванович", - "первый заместитель министра"], - list(map(str.strip, table[1]))) + self.assertListEqual(["1", "Председатель\nкомиссии", "Городецкий \n\nЯрослав Иванович", "первый заместитель министра"], list(map(str.strip, table[1]))) - self.assertEqual("начальник управления по гражданской обороне, чрезвычайным ситуациям и пожарной безопасности", - table[8][3].strip()) + self.assertEqual("начальник управления по гражданской обороне, чрезвычайным ситуациям и пожарной безопасности", table[8][3].strip()) def test_law_html_with_part_item_quotes(self) -> None: # документ разбирается как ФОИВ, тип вершин меняется. Тест теряет смысл в этом контексте @@ -521,22 +507,22 @@ def test_law_html_with_part_item_quotes(self) -> None: document_tree = result["content"]["structure"] self.__test_law_tree_sanity(document_tree) item = self._get_by_tree_path(document_tree, "0.0.0") - self.assertEqual('articlePart', item["metadata"]["paragraph_type"]) + self.assertEqual("articlePart", item["metadata"]["paragraph_type"]) # Спека на ФОИВы говорит: Пункты нумеруются арабскими цифрами с точкой и заголовков не имеют. self.assertEqual("1.", item["text"].strip()) subitem = self._get_by_tree_path(document_tree, "0.0.0.1") - self.assertEqual('item', subitem['metadata']['paragraph_type']) - self.assertEqual('1)', subitem['text'].strip()) + self.assertEqual("item", subitem["metadata"]["paragraph_type"]) + self.assertEqual("1)", subitem["text"].strip()) # цитата quotation = self._get_by_tree_path(document_tree, "0.0.0.1.2") - self.assertEqual('raw_text', quotation['metadata']['paragraph_type']) - self.assertTrue(quotation['text'].strip().startswith('16)')) + self.assertEqual("raw_text", quotation["metadata"]["paragraph_type"]) + self.assertTrue(quotation["text"].strip().startswith("16)")) subitem = self._get_by_tree_path(document_tree, "0.0.0.2") - self.assertEqual('item', subitem['metadata']['paragraph_type']) - self.assertEqual('2)', subitem['text'].strip()) + self.assertEqual("item", subitem["metadata"]["paragraph_type"]) + self.assertEqual("2)", subitem["text"].strip()) quotation = self._get_by_tree_path(document_tree, "0.0.0.2.2") - self.assertEqual('raw_text', quotation['metadata']['paragraph_type']) - self.assertTrue(quotation['text'].strip().startswith('1.')) + self.assertEqual("raw_text", quotation["metadata"]["paragraph_type"]) + self.assertTrue(quotation["text"].strip().startswith("1.")) def test_law_html_with_applications(self) -> None: file_name = "with_applications.html" @@ -546,13 +532,13 @@ def test_law_html_with_applications(self) -> None: self.__test_law_tree_sanity(document_tree) cellar = self._get_by_tree_path(document_tree, "0.1") - self.assertEqual('Торбеевского муниципального ра', cellar["text"][:30]) - self.assertEqual('cellar', cellar["metadata"]["paragraph_type"]) + self.assertEqual("Торбеевского муниципального ра", cellar["text"][:30]) + self.assertEqual("cellar", cellar["metadata"]["paragraph_type"]) application1 = self._get_by_tree_path(document_tree, "0.2") - self.assertIn('Приложение 1', 
application1["text"]) + self.assertIn("Приложение 1", application1["text"]) application2 = self._get_by_tree_path(document_tree, "0.3") - self.assertIn('Приложение 1', application2["text"]) # there are two Приложение 1 applications in the document + self.assertIn("Приложение 1", application2["text"]) # there are two Приложение 1 applications in the document application3 = self._get_by_tree_path(document_tree, "0.4") self.assertIn("Приложение 2", application3["text"]) application4 = self._get_by_tree_path(document_tree, "0.5") @@ -568,9 +554,8 @@ def test_law_html_with_applications_after_header(self) -> None: applications = self._get_applications(document_tree) self.__test_law_tree_sanity(document_tree) - self.assertIn('Приложение\nк приказу МВД России\nот __.__.2019 ' - 'N ___\nПЕРЕЧЕНЬ\nИЗМЕНЕНИЙ, ВНОСИМЫХ В НОРМАТИВНЫЕ ПРАВОВЫЕ АКТЫ МВД РОССИИ', - applications[0]['text']) + self.assertIn("Приложение\nк приказу МВД России\nот __.__.2019 N ___\nПЕРЕЧЕНЬ\nИЗМЕНЕНИЙ, ВНОСИМЫХ В НОРМАТИВНЫЕ ПРАВОВЫЕ АКТЫ МВД РОССИИ", + applications[0]["text"]) def test_foiv_html(self) -> None: """ @@ -583,7 +568,7 @@ def test_foiv_html(self) -> None: node = self._get_by_tree_path(document_tree, "0.1.") self.assertEqual("Министр\nВ.Н. Фальков", node["text"].strip()) - self.assertEqual("cellar", node['metadata']['paragraph_type']) + self.assertEqual("cellar", node["metadata"]["paragraph_type"]) application1 = self._get_by_tree_path(document_tree, "0.2")["text"].strip() self.assertTrue(application1.startswith("Приложение")) @@ -592,29 +577,29 @@ def test_foiv_html(self) -> None: node = self._get_by_tree_path(document_tree, "0.2.0") self.assertEqual("I. Общие положения", node["text"].strip()) - self.assertEqual("chapter", node['metadata']['paragraph_type']) + self.assertEqual("chapter", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.2.1") self.assertEqual("II. Требования к структуре программы специалитета", node["text"].strip()) - self.assertEqual("chapter", node['metadata']['paragraph_type']) + self.assertEqual("chapter", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.2.2") self.assertEqual("III. Требования к результатам освоения\nпрограммы специалитета", node["text"].strip()) - self.assertEqual("chapter", node['metadata']['paragraph_type']) + self.assertEqual("chapter", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.2.3") self.assertEqual("IV. 
Требования к условиям реализации программы специалитета", node["text"].strip()) - self.assertEqual("chapter", node['metadata']['paragraph_type']) + self.assertEqual("chapter", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.2.3.0") self.assertEqual("4.1.", node["text"].strip()) - self.assertEqual("item", node['metadata']['paragraph_type']) + self.assertEqual("item", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.2.3.1") self.assertEqual("4.2.", node["text"].strip()) - self.assertEqual("item", node['metadata']['paragraph_type']) + self.assertEqual("item", node["metadata"]["paragraph_type"]) node = self._get_by_tree_path(document_tree, "0.2.3.1.1") self.assertEqual("4.2.1.", node["text"].strip()) - self.assertEqual("item", node['metadata']['paragraph_type']) + self.assertEqual("item", node["metadata"]["paragraph_type"]) def test_html_invisible_table(self) -> None: file_name = "invisibly_table4.html" @@ -628,5 +613,5 @@ def __test_law_tree_sanity(self, tree: Dict[str, dict]) -> None: cnt = Counter() for item in linear: cnt[item["metadata"]["paragraph_type"]] += 1 - self.assertEqual(1, cnt["root"], "Document should have only one root, get {}".format(cnt["root"])) - self.assertEqual(1, cnt["body"], "Document should have only one body, get {}".format(cnt["body"])) + self.assertEqual(1, cnt["root"], f"Document should have only one root, get {cnt['root']}") + self.assertEqual(1, cnt["body"], f"Document should have only one body, get {cnt['body']}") diff --git a/tests/api_tests/test_api_doctype_tz.py b/tests/api_tests/test_api_doctype_tz.py index 11247e2c..f7e11719 100644 --- a/tests/api_tests/test_api_doctype_tz.py +++ b/tests/api_tests/test_api_doctype_tz.py @@ -105,8 +105,7 @@ def _check_content(self, result: dict, file_name: str) -> None: self.assertGreater(len(annotations), 0) self._check_tz_tree(structure) - self.assertEqual("ТЕХНИЧЕСКОЕ ЗАДАНИЕ \nНа разведение и уход за альпаками. Принято министерством по делам альпаководства.", - structure["text"].strip()) + self.assertEqual("ТЕХНИЧЕСКОЕ ЗАДАНИЕ \nНа разведение и уход за альпаками. 
Принято министерством по делам альпаководства.", structure["text"].strip())
         toc = structure["subparagraphs"][0]
         self.assertEqual("содержание", toc["text"].strip().lower())
         self.assertEqual("toc", toc["metadata"]["paragraph_type"])
diff --git a/tests/api_tests/test_api_email.py b/tests/api_tests/test_api_email.py
new file mode 100644
index 00000000..055c86fe
--- /dev/null
+++ b/tests/api_tests/test_api_email.py
@@ -0,0 +1,50 @@
+import os
+
+from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
+
+
+class TestApiEmailReader(AbstractTestApiDocReader):
+
+    def _get_abs_path(self, file_name: str) -> str:
+        return os.path.join(self.data_directory_path, "eml", file_name)
+
+    def test_email_file(self) -> None:
+        file_name = "spam_mail.eml"
+        result = self._send_request(file_name, data={"with_attachments": "true"})
+        attachments = result["attachments"]
+
+        self.assertEqual(len(attachments), 1)  # message header fields
+        self.assertIn("message_header_", attachments[0]["metadata"]["file_name"])
+        content = result["content"]
+        structure = content["structure"]
+        self._check_tree_sanity(structure)
+        self.assertEqual("[Spam]Artificial flowers for decors", structure["text"])
+
+        # require fields [subject, from, to, cc, bcc, date, reply-to]
+
+        from_message = structure["subparagraphs"][1]
+        to_message = structure["subparagraphs"][2]
+        self.assertEqual("modis@ispras.ru", to_message["text"])
+        self.assertEqual("to", to_message["metadata"]["paragraph_type"])
+        self.assertEqual('"sunny_goldensun@126.com" <sunny_goldensun@126.com>', from_message["text"])
+        self.assertEqual("from", from_message["metadata"]["paragraph_type"])
+
+    def test_email_with_attachments(self) -> None:
+        file_name = "message.eml"
+        result = self._send_request(file_name, data={"with_attachments": "true"})
+        structure = result["content"]["structure"]
+        attachments = result["attachments"]
+        self._check_tree_sanity(structure)
+
+        self.assertEqual("TetSubj", structure["text"])
+        from_message = structure["subparagraphs"][1]
+        to_message = structure["subparagraphs"][2]
+        self.assertEqual('"bb@bb.bb" <bb@bb.bb>', to_message["text"])
+        self.assertEqual("to", to_message["metadata"]["paragraph_type"])
+        self.assertEqual('"aa@aa.aa" <aa@aa.aa>', from_message["text"])
+        self.assertEqual("from", from_message["metadata"]["paragraph_type"])
+
+        self.assertEqual(3, len(attachments))
+        self.assertIn("message_header_", attachments[0]["metadata"]["file_name"])
+        self.assertEqual("grafana.jpg", attachments[1]["metadata"]["file_name"])
+        self.assertEqual("KY100Product SheetProduct Sheet.pdf", attachments[2]["metadata"]["file_name"])
diff --git a/tests/api_tests/test_api_format_archives.py b/tests/api_tests/test_api_format_archives.py
index 0fffa991..63663149 100644
--- a/tests/api_tests/test_api_format_archives.py
+++ b/tests/api_tests/test_api_format_archives.py
@@ -10,38 +10,38 @@ def _get_abs_path(self, file_name: str) -> str:
 
     def _check_archive_with_english_doc(self, file_name: str) -> None:
         result = self._send_request(file_name, dict(with_attachments="True"))
-        self.assertEqual(len(result['attachments']), 4)
-        english_doc = [doc for doc in result['attachments'] if doc["metadata"]["file_name"].startswith("english_doc")][0]
+        self.assertEqual(len(result["attachments"]), 4)
+        english_doc = [doc for doc in result["attachments"] if doc["metadata"]["file_name"].startswith("english_doc")][0]
         self._check_english_doc(english_doc)
 
     def test_zip(self) -> None:
         file_name = "arch_with_attachs.zip"
         result = self._send_request(file_name, dict(with_attachments="True"))
-
diff --git a/tests/api_tests/test_api_format_archives.py b/tests/api_tests/test_api_format_archives.py index 0fffa991..63663149 100644 --- a/tests/api_tests/test_api_format_archives.py +++ b/tests/api_tests/test_api_format_archives.py @@ -10,38 +10,38 @@ def _get_abs_path(self, file_name: str) -> str: def _check_archive_with_english_doc(self, file_name: str) -> None: result = self._send_request(file_name, dict(with_attachments="True")) - self.assertEqual(len(result['attachments']), 4) - english_doc = [doc for doc in result['attachments'] if doc["metadata"]["file_name"].startswith("english_doc")][0] + self.assertEqual(len(result["attachments"]), 4) + english_doc = [doc for doc in result["attachments"] if doc["metadata"]["file_name"].startswith("english_doc")][0] self._check_english_doc(english_doc) def test_zip(self) -> None: file_name = "arch_with_attachs.zip" result = self._send_request(file_name, dict(with_attachments="True")) - self.assertEqual(len(result['attachments']), 4) + self.assertEqual(len(result["attachments"]), 4) def test_tar(self) -> None: file_name = "arch_with_attachs.tar" result = self._send_request(file_name, dict(with_attachments="True")) - self.assertEqual(len(result['attachments']), 4) + self.assertEqual(len(result["attachments"]), 4) def test_targz(self) -> None: file_name = "arch_with_attachs.tar.gz" result = self._send_request(file_name, dict(with_attachments="True")) - self.assertEqual(len(result['attachments']), 4) + self.assertEqual(len(result["attachments"]), 4) def test_rar(self) -> None: file_name = "arch_with_attachs.rar" result = self._send_request(file_name, dict(with_attachments="True")) - self.assertEqual(len(result['attachments']), 4) + self.assertEqual(len(result["attachments"]), 4) def test_7zip(self) -> None: file_name = "arch_with_attachs.7z" result = self._send_request(file_name, dict(with_attachments="True")) - self.assertEqual(len(result['attachments']), 4) + self.assertEqual(len(result["attachments"]), 4) def test_zip_as_archive(self) -> None: file_name = "zipka_eng.zip" @@ -66,15 +66,15 @@ def test_archive_subfolder_7z(self) -> None: def test_zip_with_unsupported_file(self) -> None: file_name = "arch_with_unsupport_atchs.zip" result = self._send_request(file_name, dict(with_attachments="True")) - attachs = result['attachments'] + attachs = result["attachments"] self.assertEqual(len(attachs), 6) unsupported = [att for att in attachs if att["metadata"]["file_name"] == "file.bin"][0]["metadata"] - self.assertEqual(unsupported['file_type'], "application/octet-stream") + self.assertEqual(unsupported["file_type"], "application/octet-stream") def test_broken_archive(self) -> None: file_name = "broken.zip" result = self._send_request(file_name, dict(with_attachments="True")) - self.assertEqual(len(result['attachments']), 7) - english_doc = [doc for doc in result['attachments'] if doc["metadata"]["file_name"].startswith("english_doc")][0] + self.assertEqual(len(result["attachments"]), 7) + english_doc = [doc for doc in result["attachments"] if doc["metadata"]["file_name"].startswith("english_doc")][0] self._check_english_doc(english_doc) diff --git a/tests/api_tests/test_api_format_csv.py b/tests/api_tests/test_api_format_csv.py index bd32657f..ffccb2e2 100644 --- a/tests/api_tests/test_api_format_csv.py +++ b/tests/api_tests/test_api_format_csv.py @@ -1,5 +1,4 @@ import os -import unittest from typing import List from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -58,11 +57,9 @@ def test_csv_books(self) -> None: tables = result["content"]["tables"] table = tables[0]["cells"] - self.assertListEqual(["id", "cat", "name", "price", "inStock", "author", "series_t", "sequence_i", "genre_s"], - table[0]) - self.assertListEqual( - ["055357342X", "book", "A Storm of Swords", "7.99", "true", "George R.R. Martin", "A Song of Ice and Fire", - "3", "fantasy"], table[3]) + self.assertListEqual(["id", "cat", "name", "price", "inStock", "author", "series_t", "sequence_i", "genre_s"], table[0]) + self.assertListEqual(["055357342X", "book", "A Storm of Swords", "7.99", "true", "George R.R. Martin", "A Song of Ice and Fire", "3", "fantasy"], + table[3]) def test_csv_books2(self) -> None: file_name = "books_2.csv" @@ -70,12 +67,10 @@ def test_csv_books2(self) -> None: self.assertIn("delimiter is ','", result["warnings"]) tables = result["content"]["tables"] table = tables[0]["cells"] - self.assertListEqual(['0553573403', 'book', "A Game of Throne, kings and other stuff", - '7.99', 'True', 'George R.R. Martin', 'A Song of Ice and Fire', '1', 'fantasy'], - table[1]) - self.assertListEqual( - ["0553579908", "book", 'A Clash of "Kings"', '7.99', 'True', 'George R.R. Martin', - 'A Song of Ice and Fire', '2', 'fantasy'], table[2]) + self.assertListEqual(["0553573403", "book", "A Game of Throne, kings and other stuff", "7.99", "True", "George R.R. Martin", "A Song of Ice and Fire", + "1", "fantasy"], table[1]) + self.assertListEqual(["0553579908", "book", 'A Clash of "Kings"', "7.99", "True", "George R.R. Martin", "A Song of Ice and Fire", "2", "fantasy"], + table[2]) def __check_content(self, tables: List[dict]) -> None: self.assertEqual(1, len(tables)) @@ -84,7 +79,7 @@ def __check_content(self, tables: List[dict]) -> None: self.assertEqual("1", rows1[0][0]) self.assertEqual("2", rows1[0][1]) - self.assertEqual('3', rows1[0][2]) + self.assertEqual("3", rows1[0][2]) self.assertEqual("2", rows1[1][0]) self.assertEqual("1", rows1[1][1])
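The csv hunks above illustrate the quote normalization applied across the patch: plain string literals take double quotes, while single quotes remain exactly where they spare an escape, as in 'A Clash of "Kings"'. A small sketch of what the checker accepts and flags:

    # consistent with the double-quote convention:
    title = "A Storm of Swords"      # plain literal, double quotes
    quoted = 'A Clash of "Kings"'    # single quotes avoid escaping the inner double quotes

    # would be flagged instead:
    # title = 'A Storm of Swords'        # single quotes where doubles are expected
    # quoted = "A Clash of \"Kings\""    # avoidable escapes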
diff --git a/tests/api_tests/test_api_format_docx.py b/tests/api_tests/test_api_format_docx.py index e7c9a5b4..a74a95ef 100644 --- a/tests/api_tests/test_api_format_docx.py +++ b/tests/api_tests/test_api_format_docx.py @@ -1,7 +1,7 @@ import os -from tests.api_tests.abstract_api_test import AbstractTestApiDocReader from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader from tests.test_utils import get_by_tree_path @@ -25,7 +25,7 @@ def test_docx(self) -> None: file_name = "example.docx" result = self._send_request(file_name, data={"insert_table": True}) self.__check_doc_like(result) - self._check_metainfo(result['metadata'], 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', file_name) + self._check_metainfo(result["metadata"], "application/vnd.openxmlformats-officedocument.wordprocessingml.document", file_name) def test_docx_ujson(self) -> None: file_name = "example.docx" @@ -36,40 +36,39 @@ def test_doc(self) -> None: file_name = "example.doc" result = self._send_request(file_name, data={"insert_table": True, "structure_type": "tree"}) self.__check_doc_like(result) - self._check_metainfo(result['metadata'], 'application/msword', file_name) + self._check_metainfo(result["metadata"], "application/msword", file_name) def test_odt(self) -> None: file_name = "example.odt" result = self._send_request(file_name, data={"insert_table": True}) self.__check_doc_like(result) - self._check_metainfo(result['metadata'], 'application/vnd.oasis.opendocument.text', file_name) + self._check_metainfo(result["metadata"], "application/vnd.oasis.opendocument.text", file_name) def test_doc_insert_table(self) -> None: file_name = "example.doc" result = self._send_request(file_name, data=dict(structure_type="tree", insert_table=True)) self.__check_doc_like_insert_table(result) - self._check_metainfo(result['metadata'], 'application/msword', file_name) + self._check_metainfo(result["metadata"], "application/msword", file_name) def test_docx_insert_table(self) -> None: file_name = "example.docx" result = self._send_request(file_name, data=dict(structure_type="tree", insert_table=True)) self.__check_doc_like_insert_table(result) - self._check_metainfo(result['metadata'], - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', file_name) + self._check_metainfo(result["metadata"], "application/vnd.openxmlformats-officedocument.wordprocessingml.document", file_name) def test_odt_insert_table(self) -> None: file_name = "example.odt" result = self._send_request(file_name, data=dict(structure_type="tree", insert_table=True)) self.__check_doc_like_insert_table(result) - self._check_metainfo(result['metadata'], 'application/vnd.oasis.opendocument.text', file_name) + self._check_metainfo(result["metadata"], "application/vnd.oasis.opendocument.text", file_name) def test_odt_with_split(self) -> None: file_name = "ТЗ_ГИС_3 .odt" result = self._send_request(file_name) content = result["content"]["structure"] - self.assertEqual(content["subparagraphs"][0]["text"].strip(), 'Система должна обеспечивать защиту от несанкционированного доступа (НСД)') + self.assertEqual(content["subparagraphs"][0]["text"].strip(), "Система должна обеспечивать защиту от несанкционированного доступа (НСД)") def test_broken_conversion(self) -> None: file_name = "broken.odt" @@ -114,12 +113,12 @@ def test_return_html(self) -> None: self.assertIn(" id = 0 ; type = root      Пример документа", result) self.assertTrue("\n" "\n" - "N\n" - "Фамилия\n" - "Имя\n" - "Организация\n" - "Телефон\n" - "Примечания\n" + 'N\n' + 'Фамилия\n' + 'Имя\n' + 'Организация\n' + 'Телефон\n' + 'Примечания\n' "" in result) def test_newline_tree(self) -> None: @@ -136,7 +135,7 @@ def test_docx_heading_new(self) -> None: def __check_doc_like(self, result: dict) -> None: content = result["content"]["structure"] self.assertEqual("", get_by_tree_path(content, "0")["text"]) - self.assertEqual('Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения', + self.assertEqual("Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения", get_by_tree_path(content, "0.0")["text"].strip()) self.assertEqual("1.2.1. Поясним за непонятное", get_by_tree_path(content, "0.1.0")["text"].strip()) self.assertEqual("1.2.2. Поясним за понятное", get_by_tree_path(content, "0.1.1")["text"].strip()) @@ -146,8 +145,7 @@ def __check_doc_like(self, result: dict) -> None: table1, table2 = result["content"]["tables"] - self.assertListEqual(["N", "Фамилия", "Имя", "Организация", "Телефон", "Примечания"], - table1["cells"][0]) + self.assertListEqual(["N", "Фамилия", "Имя", "Организация", "Телефон", "Примечания"], table1["cells"][0]) self.assertListEqual(["1", "Иванов", "Иван", "ИСП", "8-800", ""], table1["cells"][1]) self.assertListEqual(["Фамилия", "Имя", "Отчество"], table2["cells"][0]) @@ -165,7 +163,7 @@ def __check_doc_like(self, result: dict) -> None: def __check_doc_like_insert_table(self, result: dict) -> None: content = result["content"]["structure"] self.assertEqual("", get_by_tree_path(content, "0")["text"]) - self.assertEqual('Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения', + self.assertEqual("Пример документа\nГлава 1\nКакие то определения\nСтатья 1\nОпределим опрделения\nСтатья 2\nДадим пояснения", get_by_tree_path(content, "0.0")["text"].strip()) self.assertEqual("1.2.1. Поясним за непонятное", get_by_tree_path(content, "0.1.0")["text"].strip()) self.assertEqual("1.2.2. Поясним за понятное", get_by_tree_path(content, "0.1.1")["text"].strip())
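The import hunk at the top of this file recurs in several modules below: standard-library imports come first, and the first-party groups are ordered alphabetically, so dedoc imports precede the tests package, evidently to satisfy the import-order checker. Sketched on this module's own imports (as reordered by the hunk above):

    # stdlib first
    import os

    # first-party packages in alphabetical order: dedoc before tests
    from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation
    from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
    from tests.test_utils import get_by_tree_path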
diff --git a/tests/api_tests/test_api_format_docx_annotations.py b/tests/api_tests/test_api_format_docx_annotations.py index 9969563f..65df827d 100644 --- a/tests/api_tests/test_api_format_docx_annotations.py +++ b/tests/api_tests/test_api_format_docx_annotations.py @@ -8,104 +8,104 @@ class TestApiDocxAnnotations(AbstractTestApiDocReader): def test_example_1(self) -> None: result = self._send_request("example_1.docx", data={"structure_type": "linear"}) - subparagraphs = result['content']['structure']['subparagraphs'] - annotations = [subparagraph['annotations'] for subparagraph in subparagraphs] + subparagraphs = result["content"]["structure"]["subparagraphs"] + annotations = [subparagraph["annotations"] for subparagraph in subparagraphs] # bold, italic, underlined - self.assertIn({'start': 0, 'end': 11, 'name': 'style', 'value': 'Body'}, annotations[0]) - self.assertIn({'start': 0, 'end': 12, 'name': 'italic', 'value': 'True'}, annotations[1]) - self.assertIn({'start': 0, 'end': 10, 'name': 'bold', 'value': 'True'}, annotations[2]) - self.assertIn({'start': 0, 'end': 16, 'name': 'underlined', 'value': 'True'}, annotations[3]) - self.assertIn({'start': 0, 'end': 6, 'name': 'italic', 'value': 'True'}, annotations[4]) - self.assertIn({'start': 8, 'end': 13, 'name': 'bold', 'value': 'True'}, annotations[5]) - self.assertIn({'start': 0, 'end': 20, 'name': 'bold', 'value': 'True'}, annotations[6]) - self.assertIn({'start': 5, 'end': 20, 'name': 'underlined', 'value': 'True'}, annotations[6]) + self.assertIn({"start": 0, "end": 11, "name": "style", "value": "Body"}, annotations[0]) + self.assertIn({"start": 0, "end": 12, "name": "italic", "value": "True"}, annotations[1]) + self.assertIn({"start": 0, "end": 10, "name": "bold", "value": "True"}, annotations[2]) + self.assertIn({"start": 0, "end": 16, "name": "underlined", "value": "True"}, annotations[3]) + self.assertIn({"start": 0, "end": 6, "name": "italic", "value": "True"}, annotations[4]) + self.assertIn({"start": 8, "end": 13, "name": "bold", "value": "True"}, annotations[5]) + self.assertIn({"start": 0, "end": 20, "name": "bold", "value": "True"}, annotations[6]) + self.assertIn({"start": 5, "end": 20, "name": "underlined", "value": "True"}, annotations[6]) # alignment - self.assertIn({'start': 0, 'end': 10, 'name': 'alignment', 'value': 'left'}, annotations[8]) - self.assertIn({'start': 0, 'end': 14, 'name': 'alignment', 'value': 'center'}, annotations[9]) - self.assertIn({'start': 0, 'end': 11, 'name': 'alignment', 'value': 'right'}, annotations[10]) - self.assertIn({'start': 0, 'end': 29, 'name': 'alignment', 'value': 'both'}, annotations[11]) + self.assertIn({"start": 0, "end": 10, "name": "alignment", "value": "left"}, annotations[8]) + self.assertIn({"start": 0, "end": 14, "name": "alignment", "value": "center"}, annotations[9]) + self.assertIn({"start": 0, "end": 11, "name": "alignment", "value": "right"}, annotations[10]) + self.assertIn({"start": 0, "end": 29, "name": "alignment", "value": "both"}, annotations[11]) # indent - self.assertIn({'start': 0, 'end': 12, 'name': 'indentation', 'value': '0'}, annotations[12]) - self.assertIn({'start': 0, 'end': 11, 'name': 'indentation', 'value': '720.0'}, annotations[13]) - self.assertIn({'start': 0, 'end': 12, 'name': 'indentation', 'value': '1440.0'}, annotations[14]) + self.assertIn({"start": 0, "end": 12, "name": "indentation", "value": "0"}, annotations[12]) + self.assertIn({"start": 0, "end": 11, "name": "indentation", "value": "720.0"}, annotations[13]) + self.assertIn({"start": 0, "end": 12, "name": "indentation", "value": "1440.0"}, annotations[14]) # strike - self.assertIn({'start': 0, 'end': 11, 'name': 'strike', 'value': 'True'}, annotations[15]) + self.assertIn({"start": 0, "end": 11, "name": "strike", "value": "True"}, annotations[15]) def test_example_2(self) -> None: result = self._send_request("example_2.docx", data={"structure_type": "linear"}) - subparagraphs = result['content']['structure']['subparagraphs'] - annotations = [subparagraph['annotations'] for subparagraph in subparagraphs] + subparagraphs = result["content"]["structure"]["subparagraphs"] + annotations = [subparagraph["annotations"] for subparagraph in subparagraphs] # heading, italic, bold, underlined - self.assertIn({'start': 0, 'end': 31, 'name': 'italic', 'value': 'True'}, annotations[3]) - self.assertIn({'start': 0, 'end': 31, 'name': 'style', 'value': 'heading 4'}, annotations[3]) - self.assertIn({'start': 0, 'end': 29, 'name': 'italic', 'value': 'True'}, annotations[8]) - self.assertIn({'start': 0, 'end': 29, 'name': 'style', 'value': 'heading 9'}, annotations[8]) - self.assertIn({'start': 66, 'end': 73, 'name': 'italic', 'value': 'True'}, annotations[35]) - self.assertIn({'start': 75, 'end': 89, 'name': 'bold', 'value': 'True'}, annotations[35]) - self.assertIn({'start': 91, 'end': 111, 'name': 'underlined', 'value': 'True'}, annotations[35]) - self.assertIn({'start': 0, 'end': 153, 'name': 'size', 'value': '14.0'}, annotations[35]) - self.assertIn({'start': 153, 'end': 175, 'name': 'size', 'value': '20.0'}, annotations[35]) - self.assertIn({'start': 183, 'end': 199, 'name': 'size', 'value': '11.0'}, annotations[35]) + self.assertIn({"start": 0, "end": 31, "name": "italic", "value": "True"}, annotations[3]) + self.assertIn({"start": 0, "end": 31, "name": "style", "value": "heading 4"}, annotations[3]) + self.assertIn({"start": 0, "end": 29, "name": "italic", "value": "True"}, annotations[8]) + self.assertIn({"start": 0, "end": 29, "name": "style", "value": "heading 9"}, annotations[8]) + self.assertIn({"start": 66, "end": 73, "name": "italic", "value": "True"}, annotations[35]) + self.assertIn({"start": 75, "end": 89, "name": "bold", "value": "True"}, annotations[35]) + self.assertIn({"start": 91, "end": 111, "name": "underlined", "value": "True"}, annotations[35]) + self.assertIn({"start": 0, "end": 153, "name": "size", "value": "14.0"}, annotations[35]) + self.assertIn({"start": 153, "end": 175, "name": "size", "value": "20.0"}, annotations[35]) + self.assertIn({"start": 183, "end": 199, "name": "size", "value": "11.0"}, annotations[35]) # alignment - self.assertIn({'start': 0, 'end': 46, 'name': 'alignment', 'value': 'right'}, annotations[43]) - self.assertIn({'start': 0, 'end': 40, 'name': 'alignment', 'value': 'center'}, annotations[44]) - self.assertIn({'start': 0, 'end': 160, 'name': 'alignment', 'value': 'both'}, annotations[45]) + self.assertIn({"start": 0, "end": 46, "name": "alignment", "value": "right"}, annotations[43]) + self.assertIn({"start": 0, "end": 40, "name": "alignment", "value": "center"}, annotations[44]) + self.assertIn({"start": 0, "end": 160, "name": "alignment", "value": "both"}, annotations[45]) # bold, italic, underlined - self.assertIn({'start': 0, 'end': 26, 'name': 'bold', 'value': 'True'}, annotations[47]) - self.assertIn({'start': 0, 'end': 29, 'name': 'italic', 'value': 'True'}, annotations[48]) - self.assertIn({'start': 0, 'end': 32, 'name': 'underlined', 'value': 'True'}, annotations[49]) - self.assertIn({'start': 0, 'end': 35, 'name': 'bold', 'value': 'True'}, annotations[50]) - self.assertIn({'start': 0, 'end': 35, 'name': 'italic', 'value': 'True'}, annotations[50]) - self.assertIn({'start': 0, 'end': 51, 'name': 'bold', 'value': 'True'}, annotations[51]) - self.assertIn({'start': 0, 'end': 51, 'name': 'underlined', 'value': 'True'}, annotations[51]) - self.assertIn({'start': 0, 'end': 51, 'name': 'italic', 'value': 'True'}, annotations[51]) + self.assertIn({"start": 0, "end": 26, "name": "bold", "value": "True"}, annotations[47]) + self.assertIn({"start": 0, "end": 29, "name": "italic", "value": "True"}, annotations[48]) + self.assertIn({"start": 0, "end": 32, "name": "underlined", "value": "True"}, annotations[49]) + self.assertIn({"start": 0, "end": 35, "name": "bold", "value": "True"}, annotations[50]) + self.assertIn({"start": 0, "end": 35, "name": "italic", "value": "True"}, annotations[50]) + self.assertIn({"start": 0, "end": 51, "name": "bold", "value": "True"}, annotations[51]) + self.assertIn({"start": 0, "end": 51, "name": "underlined", "value": "True"}, annotations[51]) + self.assertIn({"start": 0, "end": 51, "name": "italic", "value": "True"}, annotations[51]) def test_spacing_1(self) -> None: result = self._send_request("spacing_libreoffice.docx", data={"structure_type": "linear"}) - subparagraphs = result['content']['structure']['subparagraphs'] - annotations = [subparagraph['annotations'] for subparagraph in subparagraphs] - - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[0]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[1]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '57'}, annotations[2]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[3]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[4]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[5]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '114'}, annotations[6]) - self.assertIn({'start': 0, 'end': 9, 'name': 'spacing', 'value': '0'}, annotations[7]) + subparagraphs = result["content"]["structure"]["subparagraphs"] + annotations = [subparagraph["annotations"] for subparagraph in subparagraphs] + + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[0]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[1]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "57"}, annotations[2]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[3]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[4]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[5]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "114"}, annotations[6]) + self.assertIn({"start": 0, "end": 9, "name": "spacing", "value": "0"}, annotations[7]) def test_spacing_2(self) -> None: result = self._send_request("spacing_microsoft_word.docx", data={"structure_type": "linear"}) - subparagraphs = result['content']['structure']['subparagraphs'] - annotations = [subparagraph['annotations'] for subparagraph in subparagraphs] - - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[0]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[1]) - self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '200'}, annotations[2]) - self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '200'}, annotations[3]) - self.assertIn({'start': 0, 'end': 32, 'name': 'spacing', 'value': '400'}, annotations[4]) - self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '400'}, annotations[5]) - self.assertIn({'start': 0, 'end': 31, 'name': 'spacing', 'value': '600'}, annotations[6]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '400'}, annotations[7]) - self.assertIn({'start': 0, 'end': 10, 'name': 'spacing', 'value': '0'}, annotations[8]) + subparagraphs = result["content"]["structure"]["subparagraphs"] + annotations = [subparagraph["annotations"] for subparagraph in subparagraphs] + + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[0]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[1]) + self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "200"}, annotations[2]) + self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "200"}, annotations[3]) + self.assertIn({"start": 0, "end": 32, "name": "spacing", "value": "400"}, annotations[4]) + self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "400"}, annotations[5]) + self.assertIn({"start": 0, "end": 31, "name": "spacing", "value": "600"}, annotations[6]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "400"}, annotations[7]) + self.assertIn({"start": 0, "end": 10, "name": "spacing", "value": "0"}, annotations[8]) def test_identation(self) -> None: result = self._send_request("indentation_libreoffice.docx", data={"structure_type": "linear"}) - subparagraphs = result['content']['structure']['subparagraphs'] - annotations = [subparagraph['annotations'] for subparagraph in subparagraphs] - self.assertIn({'start': 0, 'end': 188, 'name': 'indentation', 'value': '360.0'}, annotations[5]) - self.assertIn({'start': 0, 'end': 152, 'name': 'indentation', 'value': '708.0'}, annotations[10]) - self.assertIn({'start': 0, 'end': 0, 'name': 'indentation', 'value': '1429.0'}, annotations[12]) - self.assertIn({'start': 0, 'end': 21, 'name': 'indentation', 'value': '709.0'}, annotations[16]) - self.assertIn({'start': 0, 'end': 65, 'name': 'indentation', 'value': '786.0'}, annotations[20]) + subparagraphs = result["content"]["structure"]["subparagraphs"] + annotations = [subparagraph["annotations"] for subparagraph in subparagraphs] + self.assertIn({"start": 0, "end": 188, "name": "indentation", "value": "360.0"}, annotations[5]) + self.assertIn({"start": 0, "end": 152, "name": "indentation", "value": "708.0"}, annotations[10]) + self.assertIn({"start": 0, "end": 0, "name": "indentation", "value": "1429.0"}, annotations[12]) + self.assertIn({"start": 0, "end": 21, "name": "indentation", "value": "709.0"}, annotations[16]) + self.assertIn({"start": 0, "end": 65, "name": "indentation", "value": "786.0"}, annotations[20]) def test_table_refs(self) -> None: result = self._send_request("table_refs.docx", data={"structure_type": "linear"}) - subparagraphs = result['content']['structure']['subparagraphs'] + subparagraphs = result["content"]["structure"]["subparagraphs"] for i in [0, 2, 4, 6, 9]: - annotations = subparagraphs[i]['annotations'] + annotations = subparagraphs[i]["annotations"] found = False for annotation in annotations: if annotation["name"] == "table":
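In these annotation tests an annotation is a plain dict with start, end, name and value keys (value is a string even for booleans and sizes), and assertIn demands an exact member of the subparagraph's annotation list. A minimal sketch of the shape being asserted, with illustrative values:

    annotations = [
        {"start": 0, "end": 12, "name": "italic", "value": "True"},
        {"start": 0, "end": 11, "name": "style", "value": "Body"},
    ]

    # membership is exact: name, value and both offsets must all match
    assert {"start": 0, "end": 12, "name": "italic", "value": "True"} in annotations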
diff --git a/tests/api_tests/test_api_format_email.py b/tests/api_tests/test_api_format_email.py index 055c86fe..0846538f 100644 --- a/tests/api_tests/test_api_format_email.py +++ b/tests/api_tests/test_api_format_email.py @@ -14,7 +14,7 @@ def test_email_file(self) -> None: attachments = result["attachments"] self.assertEqual(len(attachments), 1) # message header fields - self.assertIn("message_header_", attachments[0]['metadata']['file_name']) + self.assertIn("message_header_", attachments[0]["metadata"]["file_name"]) content = result["content"] structure = content["structure"] self._check_tree_sanity(structure) diff --git a/tests/api_tests/test_api_format_excel.py b/tests/api_tests/test_api_format_excel.py index 5cc06bca..aec440c3 100644 --- a/tests/api_tests/test_api_format_excel.py +++ b/tests/api_tests/test_api_format_excel.py @@ -72,19 +72,19 @@ def __check_content_formulas(self, tables: List[dict]) -> None: self.assertEqual(2, len(tables)) table1, table2 = (table["cells"] for table in tables) - self.assertListEqual(['a', 'b', 'c'], table1[0]) - self.assertListEqual(['1.0', '2.0', '3.0'], table1[1]) - self.assertListEqual(['3.0', '4.0', '7.0'], table1[2]) - self.assertListEqual(['2.0', '3.0', '5.0'], table1[3]) - self.assertListEqual(['5.0', '6.0', '11.0'], table1[4]) - self.assertListEqual(['7.0', '33.0', '40.0'], table1[5]) - - self.assertListEqual(['r', 'p', 's', 'pi'], table2[0]) - self.assertListEqual(['1.0', '6.28', '3.14', '3.14'], table2[1]) - self.assertListEqual(['2.0', '12.56', '12.56', ''], table2[2]) - self.assertListEqual(['3.0', '18.84', '28.26', ''], table2[3]) - self.assertListEqual(['4.0', '25.12', '50.24', ''], table2[4]) - self.assertListEqual(['5.0', '31.4', '78.5', ''], table2[5]) - self.assertListEqual(['6.0', '37.68', '113.04', ''], table2[6]) - self.assertListEqual(['7.0', '43.96', '153.86', ''], table2[7]) - self.assertListEqual(['8.0', '50.24', '200.96', ''], table2[8]) + self.assertListEqual(["a", "b", "c"], table1[0]) + self.assertListEqual(["1.0", "2.0", "3.0"], table1[1]) + self.assertListEqual(["3.0", "4.0", "7.0"], table1[2]) + self.assertListEqual(["2.0", "3.0", "5.0"], table1[3]) + self.assertListEqual(["5.0", "6.0", "11.0"], table1[4]) + self.assertListEqual(["7.0", "33.0", "40.0"], table1[5]) + + self.assertListEqual(["r", "p", "s", "pi"], table2[0]) + self.assertListEqual(["1.0", "6.28", "3.14", "3.14"], table2[1]) + self.assertListEqual(["2.0", "12.56", "12.56", ""], table2[2]) + self.assertListEqual(["3.0", "18.84", "28.26", ""], table2[3]) + self.assertListEqual(["4.0", "25.12", "50.24", ""], table2[4]) + self.assertListEqual(["5.0", "31.4", "78.5", ""], table2[5]) + self.assertListEqual(["6.0", "37.68", "113.04", ""], table2[6]) + self.assertListEqual(["7.0", "43.96", "153.86", ""], table2[7]) + self.assertListEqual(["8.0", "50.24", "200.96", ""], table2[8]) diff --git a/tests/api_tests/test_api_format_html.py b/tests/api_tests/test_api_format_html.py index 5df163c7..8dd3a748 100644 --- a/tests/api_tests/test_api_format_html.py +++ b/tests/api_tests/test_api_format_html.py @@ -1,7 +1,6 @@ import os from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation - from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -53,10 +52,10 @@ def __check_example_file(self, result: dict, file_name: str) -> None: node = self._get_by_tree_path(tree, "0.0.0.1.0") self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) self.assertEqual("Определим определения \nТекст ", node["text"].strip()[:30]) - self.assertIn({'start': 1, 'end': 31, 'name': 'bold', 'value': 'True'}, node["annotations"]) - self.assertIn({'start': 46, 'end': 52, 'name': 'bold', 'value': 'True'}, node["annotations"]) - self.assertIn({'start': 42, 'end': 45, 'name': 'underlined', 'value': 'True'}, node["annotations"]) - self.assertIn({'start': 32, 'end': 42, 'name': 'italic', 'value': 'True'}, node["annotations"]) + self.assertIn({"start": 1, "end": 31, "name": "bold", "value": "True"}, node["annotations"]) + self.assertIn({"start": 46, "end": 52, "name": "bold", "value": "True"}, node["annotations"]) + self.assertIn({"start": 42, "end": 45, "name": "underlined", "value": "True"}, node["annotations"]) + self.assertIn({"start": 32, "end": 42, "name": "italic", "value": "True"}, node["annotations"]) node = self._get_by_tree_path(tree, "0.0.0.2") self.assertEqual("header", node["metadata"]["paragraph_type"]) @@ -109,7 +108,7 @@ def __check_example_file(self, result: dict, file_name: str) -> None: self.assertListEqual(["Петров", "Пётр", "Петрович"], table2["cells"][2]) self.assertListEqual(["Сидоров", "Сидор", "Сидорович"], table2["cells"][3]) - self.__check_metainfo(result['metadata'], 'text/html', file_name) + self.__check_metainfo(result["metadata"], "text/html", file_name) def test_part_html(self) -> None: file_name = "part.html" @@ -118,11 +117,9 @@ def test_part_html(self) -> None: content = result["content"]["structure"] self._check_tree_sanity(content) self.assertEqual("Лесные слоны", content["subparagraphs"][0]["text"].strip()) - self.assertEqual("В данном разделе мы поговорим о малоизвестных лесных слонах...", - content["subparagraphs"][0]["subparagraphs"][0]["text"].strip()) + self.assertEqual("В данном разделе мы поговорим о малоизвестных лесных слонах...", content["subparagraphs"][0]["subparagraphs"][0]["text"].strip()) self.assertEqual("Среда обитания", content["subparagraphs"][0]["subparagraphs"][1]["text"].strip()) - self.assertEqual("Лесные слоны живут не на деревьях, а под ними.", - content["subparagraphs"][0]["subparagraphs"][1]["subparagraphs"][0]["text"].strip()) + self.assertEqual("Лесные слоны живут не на деревьях, а под ними.", content["subparagraphs"][0]["subparagraphs"][1]["subparagraphs"][0]["text"].strip()) def test_plain_text_html(self) -> None: file_name = "plain.html" @@ -173,15 +170,15 @@ def test_html_newlines(self) -> None: self.assertIn("For service repair (Part 145) returned material authorizations (RMA):", text) def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None: - self.assertEqual(metainfo['file_type'], actual_type) - self.assertEqual(metainfo['file_name'], actual_name) + self.assertEqual(metainfo["file_type"], actual_type) + self.assertEqual(metainfo["file_name"], actual_name) def test_html_encoding(self) -> None: file_name = "53.html" result = self._send_request(file_name) content = result["content"]["structure"] text = content["subparagraphs"][0]["text"] - self.assertTrue(text.startswith('\n\n')) + self.assertTrue(text.startswith("\n\n")) def test_html_no_newline(self) -> None: file_name = "no_new_line.html" @@ -192,8 +189,7 @@ def test_html_no_newline(self) -> None: expected_text = ('"I can’t bring myself to feel too sorry for Amazon or textbook publishers, given how much ' 'they tend to gouge on the prices of those books."') self.assertEqual(expected_text, text.strip()) - italics = [text[annotation["start"]: annotation["end"]] for annotation in node["annotations"] if - annotation["name"] == "italic"] + italics = [text[annotation["start"]: annotation["end"]] for annotation in node["annotations"] if annotation["name"] == "italic"] self.assertIn("or", italics) def test_html_none_display(self) -> None: diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py index 99968320..72128afc 100644 --- a/tests/api_tests/test_api_format_json.py +++ b/tests/api_tests/test_api_format_json.py @@ -81,7 +81,7 @@ def test_broken(self) -> None: def test_json_attachments2(self) -> None: file_name = "test2.json" - data = {'html_fields': '[["e"], ["f"]]', 'with_attachments': 'True', 'return_base64': 'true'} + data = {"html_fields": '[["e"], ["f"]]', "with_attachments": "True", "return_base64": "true"} self._send_request(file_name, expected_code=200, data=data) def test_json_null(self) -> None: @@ -104,7 +104,7 @@ def test_json_null(self) -> None: def test_json_broken_parameters(self) -> None: file_name = "test2.json" - data = {'html_fields': '[[ef]]', 'with_attachments': 'True', 'return_base64': 'true'} + data = {"html_fields": "[[ef]]", "with_attachments": "True", "return_base64": "true"} with self.assertRaises(JSONDecodeError): json.loads(data["html_fields"]) self._send_request(file_name, expected_code=400, data=data) diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py index 2e49beba..5f766472 100644 --- a/tests/api_tests/test_api_format_pdf.py +++ b/tests/api_tests/test_api_format_pdf.py @@ -1,10 +1,10 @@ import os -from tests.api_tests.abstract_api_test import AbstractTestApiDocReader from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation from dedoc.utils import supported_image_types +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader class TestApiPdfReader(AbstractTestApiDocReader): @@ -23,26 +23,25 @@ def __check_example_file(self, result: dict) -> None: self._check_similarity("1.2.1 Поясним за непонятное", content[3]["subparagraphs"][0]["text"]) def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None: - self.assertEqual(metainfo['file_type'], actual_type) - self.assertEqual(metainfo['file_name'], actual_name) + self.assertEqual(metainfo["file_type"], actual_type) + self.assertEqual(metainfo["file_name"], actual_name) def test_pdf(self) -> None: file_name = "example.pdf" result = self._send_request(file_name, data=dict(with_attachments=True, document_type="", pdf_with_text_layer="false")) self.__check_example_file(result) - self.__check_metainfo(result['metadata'], 'application/pdf', file_name) - self.assertEqual([], result['attachments']) + self.__check_metainfo(result["metadata"], "application/pdf", file_name) + self.assertEqual([], result["attachments"]) def test_djvu(self) -> None: file_name = "example_with_table7.djvu" result = self._send_request(file_name, dict(document_type="")) tree = result["content"]["structure"] self._check_tree_sanity(tree) - self.assertEqual('2. Срок поставки в течении 70 дней с момента внесения авансового платежа.\n', - self._get_by_tree_path(tree, "0.2.1")['text']) - self.assertEqual("3. Срок изготовления не ранее 2018г.\n", self._get_by_tree_path(tree, "0.2.2")['text']) + self.assertEqual("2. Срок поставки в течении 70 дней с момента внесения авансового платежа.\n", self._get_by_tree_path(tree, "0.2.1")["text"]) + self.assertEqual("3. Срок изготовления не ранее 2018г.\n", self._get_by_tree_path(tree, "0.2.2")["text"]) - self.__check_metainfo(result['metadata'], 'image/vnd.djvu', file_name) + self.__check_metainfo(result["metadata"], "image/vnd.djvu", file_name) def test_djvu_2(self) -> None: file_name = "example_with_table9.djvu" @@ -52,7 +51,7 @@ def test_djvu_2(self) -> None: self.assertEqual("1. Предмет закупки, источник финансирования :\n", self._get_by_tree_path(content, "0.1.0")["text"]) self.assertEqual("2. Место выполнения Работ:\n", self._get_by_tree_path(content, "0.1.1")["text"]) - self.__check_metainfo(result['metadata'], 'image/vnd.djvu', file_name) + self.__check_metainfo(result["metadata"], "image/vnd.djvu", file_name) def test_broken_djvu(self) -> None: file_name = "broken.djvu" @@ -75,13 +74,13 @@ def test_header_pdf(self) -> None: self._check_similarity("4.5. п", self._get_by_tree_path(tree, "0.1.3.0.4")["text"]) self._check_similarity("4.6. п", self._get_by_tree_path(tree, "0.1.3.0.5")["text"]) - self.__check_metainfo(result['metadata'], 'application/pdf', file_name) + self.__check_metainfo(result["metadata"], "application/pdf", file_name) def test_images(self) -> None: formats = [ - '.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.ppm', '.pnm', '.pgm', - '.pbm', '.webp', '.pcx', '.eps', '.sgi', '.hdr', '.pic', '.sr', '.ras', - '.dib', '.jpe', '.jfif', '.j2k' + ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif", ".ppm", ".pnm", ".pgm", + ".pbm", ".webp", ".pcx", ".eps", ".sgi", ".hdr", ".pic", ".sr", ".ras", + ".dib", ".jpe", ".jfif", ".j2k" ] for image_format in formats: @@ -101,35 +100,34 @@ def test_image_binarization(self) -> None: result = self._send_request("01_МФО_Наклон.jpg", data=dict(need_binarization="true")) self.assertIn("ЦЕНТРАЛЬНЫЙ БАНК РОССИЙСКОЙ ФЕДЕРАЦИИ\n{БАНК РОССИИ)\nСВИДЕТЕЛЬСТВО\nО ВНЕСЕНИИ СВЕДЕНИЙ О ЮРИДИЧЕСКОМ ЛИЦЕ\n" - "В ГОСУДАРСТВЕННЫЙ РЕЕСТР МИКРОФИНАНСОВЫХ ОРГАНИЗАЦИЙ", result['content']['structure']['subparagraphs'][0]['text']) - self.assertIn("Е.И Курицына\n(расшифровка подлиси", result['content']['structure']['subparagraphs'][1]['text']) + "В ГОСУДАРСТВЕННЫЙ РЕЕСТР МИКРОФИНАНСОВЫХ ОРГАНИЗАЦИЙ", result["content"]["structure"]["subparagraphs"][0]["text"]) + self.assertIn("Е.И Курицына\n(расшифровка подлиси", result["content"]["structure"]["subparagraphs"][1]["text"]) def test_on_ocr_conf_threshold(self) -> None: result = self._send_request("with_trash.jpg", data=dict(structure_type="tree")) tree = result["content"]["structure"] self._check_tree_sanity(tree) # check, that handwritten text was filtered - self._check_similarity('ФИО года рождения, паспорт: серия \n№ выдан _, дата выдачи\n' - 'т. код подразделения зарегистрированный по адресу:\n \n', tree['subparagraphs'][3]['text']) + self._check_similarity("ФИО года рождения, паспорт: серия \n№ выдан _, дата выдачи\nт. код подразделения зарегистрированный по адресу:\n \n", + tree["subparagraphs"][3]["text"]) def test_rotated_image(self) -> None: result = self._send_request("orient_1.png", data=dict(need_pdf_table_analysis="false")) tree = result["content"]["structure"] self._check_tree_sanity(tree) - self._check_similarity(tree['subparagraphs'][0]['text'], 'Приложение к Положению о порядке\n' - 'формирования, ведения и утверждения\n' - 'ведомственных перечней государственных услуг\n' - 'и работ, оказываемых и выполняемых\n' - 'государственными учреждениями Калужской\n' - 'области\n') + self._check_similarity(tree["subparagraphs"][0]["text"], "Приложение к Положению о порядке\n" + "формирования, ведения и утверждения\n" + "ведомственных перечней государственных услуг\n" + "и работ, оказываемых и выполняемых\n" + "государственными учреждениями Калужской\n" + "области\n") def test_pdf_with_only_mp_table(self) -> None: file_name = os.path.join("..", "tables", "multipage_table.pdf") result = self._send_request(file_name) - table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"] - if ann["name"] == "table"] + table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"] if ann["name"] == "table"] self.assertTrue(len(result["content"]["tables"]), len(table_refs)) for table in result["content"]["tables"]: @@ -145,15 +143,14 @@ def test_pdf_with_some_tables(self) -> None: # checks indentations par = self._get_by_tree_path(tree, "0.4.0.0") annotations = par["annotations"] - self.assertIn({"end": 170, 'value': '600', 'name': 'indentation', 'start': 0}, annotations) + self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, annotations) self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"]) def test_pdf_with_only_table(self) -> None: file_name = os.path.join("..", "pdf_with_text_layer", "VVP_global_table.pdf") result = self._send_request(file_name) - self.assertTrue(result["content"]["tables"][0]["metadata"]["uid"] == - result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"]) + self.assertEqual(result["content"]["tables"][0]["metadata"]["uid"], result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"]) def test_2_columns(self) -> None: file_name = os.path.join("..", "scanned", "example_2_columns.png") @@ -166,8 +163,8 @@ def test_document_orientation(self) -> None: file_name = "orient_3.png" result = self._send_request(file_name, data=dict(document_orientation="auto")) tree = result["content"]["structure"] - self._check_similarity(tree['subparagraphs'][0]['text'], 'Приложение к постановлению\n' - 'Губернатора Камчатского края\n' - '0729.12.2014 № 168\n' + self._check_similarity(tree["subparagraphs"][0]["text"], "Приложение к постановлению\n" + "Губернатора Камчатского края\n" + "0729.12.2014 № 168\n" '"БУРЫЙ МЕДВЕДЬ\n' - '{вид охотничьих ресурсов)\n') + "{вид охотничьих ресурсов)\n")
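The pdf module above also swaps assertTrue(a == b) for assertEqual(a, b). The check is equivalent, but on failure assertEqual reports both operands instead of a bare "False is not true", which matters when comparing table uids against annotation values. A minimal illustration:

    import unittest


    class UidExample(unittest.TestCase):
        def test_uid_matches_annotation(self) -> None:
            table_uid = annotation_value = "table-uid-1"  # illustrative values
            # before: self.assertTrue(table_uid == annotation_value)
            #   a failure would only print "False is not true"
            # after: a failure prints both values, e.g. "'uid-1' != 'uid-2'"
            self.assertEqual(table_uid, annotation_value)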
diff --git a/tests/api_tests/test_api_format_pdf_page_limit.py b/tests/api_tests/test_api_format_pdf_page_limit.py index d2533d64..ee1bf841 100644 --- a/tests/api_tests/test_api_format_pdf_page_limit.py +++ b/tests/api_tests/test_api_format_pdf_page_limit.py @@ -65,9 +65,9 @@ def __check(self, pages: str, text_expected: str, reader: str, check_partially: result = self._send_request("multipage.pdf", params) if check_partially: self.assertIn("The document is partially parsed", result["warnings"]) - self.assertIn('first_page', result['metadata']) - self.assertIn('last_page', result['metadata']) + self.assertIn("first_page", result["metadata"]) + self.assertIn("last_page", result["metadata"]) tree = result["content"]["structure"] node = self._get_by_tree_path(tree, "0.0") text = node["text"].strip() - self.assertEqual(text_expected, text, "{} and {}".format(pages, reader)) + self.assertEqual(text_expected, text, f"{pages} and {reader}") diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index b273fef8..67d66b5a 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -95,26 +95,26 @@ def test_pdf_with_text_style(self) -> None: tree = result["content"]["structure"] self._check_tree_sanity(tree=tree) sub1 = self._get_by_tree_path(tree, "0.0.0") - self.assertEqual('1.1 TimesNewRomanItalicBold20\n', sub1["text"]) - self.assertIn({'start': 0, 'end': 29, "name": "size", 'value': '20'}, sub1['annotations']) + self.assertEqual("1.1 TimesNewRomanItalicBold20\n", sub1["text"]) + self.assertIn({"start": 0, "end": 29, "name": "size", "value": "20"}, sub1["annotations"]) sub1sub1 = self._get_by_tree_path(tree, "0.0.0.0") - self.assertEqual('Different styles(Arial16):\n', sub1sub1['text']) - self.assertIn({'start': 0, 'end': 26, "name": "size", 'value': '15'}, sub1sub1['annotations']) + self.assertEqual("Different styles(Arial16):\n", sub1sub1["text"]) + self.assertIn({"start": 0, "end": 26, "name": "size", "value": "15"}, sub1sub1["annotations"]) sub2 = self._get_by_tree_path(tree, "0.1.0") - self.assertEqual('1. TimesNewRoman18\n', sub2['text']) - self.assertIn({'start': 3, 'end': 18, "name": "size", 'value': '18'}, sub2['annotations']) + self.assertEqual("1. TimesNewRoman18\n", sub2["text"]) + self.assertIn({"start": 3, "end": 18, "name": "size", "value": "18"}, sub2["annotations"]) sub3 = self._get_by_tree_path(tree, "0.1.1") - self.assertEqual('2. TimesNewRoman9, TimesNewRomanBold7.5, TimesNewRoman6.5\n', sub3['text']) - self.assertIn({'start': 3, 'end': 18, "name": "size", 'value': '9'}, sub3['annotations']) - self.assertIn({'start': 19, 'end': 57, "name": "size", 'value': '6'}, sub3['annotations']) + self.assertEqual("2. TimesNewRoman9, TimesNewRomanBold7.5, TimesNewRoman6.5\n", sub3["text"]) + self.assertIn({"start": 3, "end": 18, "name": "size", "value": "9"}, sub3["annotations"]) + self.assertIn({"start": 19, "end": 57, "name": "size", "value": "6"}, sub3["annotations"]) sub4 = self._get_by_tree_path(tree, "0.1.2") - self.assertEqual('3. TimesNewRomanItalic14, Calibri18, Tahoma16\n', sub4['text']) - self.assertIn({'start': 3, 'end': 25, "name": "size", 'value': '14'}, sub4['annotations']) - self.assertIn({'start': 26, 'end': 36, "name": "size", 'value': '18'}, sub4['annotations']) + self.assertEqual("3. TimesNewRomanItalic14, Calibri18, Tahoma16\n", sub4["text"]) + self.assertIn({"start": 3, "end": 25, "name": "size", "value": "14"}, sub4["annotations"]) + self.assertIn({"start": 26, "end": 36, "name": "size", "value": "18"}, sub4["annotations"]) def test_tables2(self) -> None: file_name = "VVP_global_table.pdf" diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py index 54265beb..407d97af 100644 --- a/tests/api_tests/test_api_format_pdf_with_text.py +++ b/tests/api_tests/test_api_format_pdf_with_text.py @@ -16,71 +16,69 @@ def __filter_by_name(self, annotations: List[dict], name: str) -> List[dict]: @unittest.skip("TODO") def test_pdf_with_text_style(self) -> None: file_name = "diff_styles.pdf" - result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", - need_pdf_table_analysis="false")) + result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false")) tree = result["content"]["structure"] self._check_tree_sanity(tree) node = self._get_by_tree_path(tree, "0.0") - self.assertEqual('1.1TimesNewRomanItalicBold20\n', node['text']) - self.assertIn({'start': 0, 'end': 28, "name": "size", 'value': '20.0'}, node['annotations']) + self.assertEqual("1.1TimesNewRomanItalicBold20\n", node["text"]) + self.assertIn({"start": 0, "end": 28, "name": "size", "value": "20.0"}, node["annotations"]) node = self._get_by_tree_path(tree, "0.1") - annotations_size = self.__filter_by_name(name="size", annotations=node['annotations']) - self.assertIn({'start': 0, 'end': 26, "name": "size", 'value': '16.0'}, annotations_size) - self.assertEqual(len(node['annotations']), 5) + annotations_size = self.__filter_by_name(name="size", annotations=node["annotations"]) + self.assertIn({"start": 0, "end": 26, "name": "size", "value": "16.0"}, annotations_size) + self.assertEqual(len(node["annotations"]), 5) self.assertEqual("Different styles(Arial16):\n", node["text"]) node = self._get_by_tree_path(tree, "0.2.2") - self.assertEqual('3. TimesNewRomanItalic14, Calibri18, Tahoma16\n', node['text']) - self.assertEqual('3. ', node['text'][0:3]) - self.assertIn({'start': 0, 'end': 36, 'name': "style", 'value': 'TimesNewRomanPSMT'}, node['annotations']) - self.assertIn({'start': 0, 'end': 2, "name": "size", 'value': '16.0'}, node['annotations']) - self.assertEqual('TimesNewRomanItalic14, ', node['text'][3:26]) - self.assertIn({'start': 0, 'end': 36, "name": "style", 'value': 'TimesNewRomanPSMT'}, node['annotations']) - self.assertIn({'start': 3, 'end': 25, "name": "size", 'value': '14.0'}, node['annotations']) - self.assertEqual('Calibri18, ', node['text'][26:37]) - self.assertIn({'start': 0, 'end': 36, "name": "style", 'value': 'TimesNewRomanPSMT'}, node['annotations']) - self.assertIn({'start': 26, 'end': 36, 'value': '18.0', "name": "size"}, node['annotations']) - self.assertEqual('Tahoma16\n', node['text'][37:46]) - self.assertIn({'start': 37, 'end': 45, 'value': 'Tahoma', "name": "style"}, node['annotations']) - self.assertIn({'start': 37, 'end': 45, "name": "size", 'value': '16.0'}, node['annotations']) - self.assertEqual(9, len(node['annotations'])) + self.assertEqual("3. TimesNewRomanItalic14, Calibri18, Tahoma16\n", node["text"]) + self.assertEqual("3. ", node["text"][0:3]) + self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"]) + self.assertIn({"start": 0, "end": 2, "name": "size", "value": "16.0"}, node["annotations"]) + self.assertEqual("TimesNewRomanItalic14, ", node["text"][3:26]) + self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"]) + self.assertIn({"start": 3, "end": 25, "name": "size", "value": "14.0"}, node["annotations"]) + self.assertEqual("Calibri18, ", node["text"][26:37]) + self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"]) + self.assertIn({"start": 26, "end": 36, "value": "18.0", "name": "size"}, node["annotations"]) + self.assertEqual("Tahoma16\n", node["text"][37:46]) + self.assertIn({"start": 37, "end": 45, "value": "Tahoma", "name": "style"}, node["annotations"]) + self.assertIn({"start": 37, "end": 45, "name": "size", "value": "16.0"}, node["annotations"]) + self.assertEqual(9, len(node["annotations"])) def test_pdf_with_text_style_2(self) -> None: file_name = "2-column-state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false")) tree = result["content"]["structure"] self._check_tree_sanity(tree) - subs = tree['subparagraphs'] + subs = tree["subparagraphs"] sub = self._get_by_tree_path(tree, "0.0") - self.assertEqual("Compromising Tor Anonymity\n", sub['text'][0:27]) - annotations_size = self.__filter_by_name(name="size", annotations=subs[0]['annotations']) - self.assertIn({'start': 0, 'end': 61, "name": "size", 'value': '18.0'}, annotations_size) + self.assertEqual("Compromising Tor Anonymity\n", sub["text"][0:27]) + annotations_size = self.__filter_by_name(name="size", annotations=subs[0]["annotations"]) + self.assertIn({"start": 0, "end": 61, "name": "size", "value": "18.0"}, annotations_size) - annotations_style = self.__filter_by_name(name="style", annotations=subs[0]['annotations']) - self.assertIn({'start': 0, 'end': 61, 'name': 'style', 'value': 'Helvetica-Bold'}, annotations_style) + annotations_style = self.__filter_by_name(name="style", annotations=subs[0]["annotations"]) + self.assertIn({"start": 0, "end": 61, "name": "style", "value": "Helvetica-Bold"}, annotations_style) - annotations_bold = self.__filter_by_name(name="bold", annotations=subs[0]['annotations']) - self.assertIn({'start': 0, 'end': 61, 'name': 'bold', 'value': "True"}, annotations_bold) + annotations_bold = self.__filter_by_name(name="bold", annotations=subs[0]["annotations"]) + self.assertIn({"start": 0, "end": 61, "name": "bold", "value": "True"}, annotations_bold) self.assertIn("Pere Manils, Abdelberi Chaabane, Stevens Le Blond,", self._get_by_tree_path(tree, "0.1")["text"]) @unittest.skip("TODO") def test_pdf_with_2_columns_text(self) -> None: file_name = "2-column-state.pdf" - result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", - need_pdf_table_analysis="false")) + result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false")) tree = result["content"]["structure"] self._check_tree_sanity(tree) self.assertIn("Privacy of users in P2P networks goes far beyond their\n" "current usage and is a fundamental requirement to the adop-\n" "tion of P2P protocols for legal usage. In a climate of cold", - self._get_by_tree_path(tree, "0.5")['text']) + self._get_by_tree_path(tree, "0.5")["text"]) - self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6")['text']) - self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.7")['text']) + self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6")["text"]) + self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.7")["text"]) self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.8.0.0")["text"]) self.assertIn("The Tor network was designed to provide freedom\n" @@ -96,8 +94,7 @@ def test_pdf_with_2_columns_text_2(self) -> None: tree = result["content"]["structure"] self.assertIn("References", self._get_by_tree_path(tree, "0.0")["text"]) - self.assertIn("[1] Navaneeth Bodla, Bharat Singh, Rama Chellappa, and", - self._get_by_tree_path(tree, "0.1")["text"]) + self.assertIn("[1] Navaneeth Bodla, Bharat Singh, Rama Chellappa, and", self._get_by_tree_path(tree, "0.1")["text"]) def test_pdf_with_some_tables(self) -> None: file_name = "VVP_6_tables.pdf" @@ -109,22 +106,20 @@ def test_pdf_with_some_tables(self) -> None: # checks indentations par = self._get_by_tree_path(tree, "0.4.0.0") - self.assertIn({'end': 170, 'value': '600', 'name': 'indentation', 'start': 0}, par["annotations"]) + self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, par["annotations"]) self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"]) def test_pdf_with_only_table(self) -> None: file_name = "VVP_global_table.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true")) - self.assertTrue(result["content"]["tables"][0]["metadata"]["uid"] == - result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"]) + self.assertEqual(result["content"]["tables"][0]["metadata"]["uid"], result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"]) def test_pdf_with_only_mp_table(self) -> None: file_name = os.path.join("..", "tables", "multipage_table.pdf") result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_header_footer_analysis=True)) - table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"] - if ann["name"] == "table"] + table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"] if ann["name"] == "table"] self.assertTrue(len(result["content"]["tables"]), len(table_refs)) for table in result["content"]["tables"]: diff --git a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py index 14d0921a..2996d6d3 100644 --- a/tests/api_tests/test_api_format_pptx.py +++ b/tests/api_tests/test_api_format_pptx.py @@ -10,31 +10,31 @@ class TestApiPPTXReader(AbstractTestApiDocReader): def test_pptx(self) -> None: file_name = "example.pptx" result = self._send_request(file_name, data=dict(structure_type="linear")) - self.__check_content(result['content']) + self.__check_content(result["content"]) def test_ppt(self) -> None: file_name = "example.ppt" result = self._send_request(file_name, data=dict(structure_type="linear")) - self.__check_content(result['content']) + self.__check_content(result["content"]) def test_odp(self) -> None: file_name = "example.odp" result = self._send_request(file_name, data=dict(structure_type="linear")) - self.__check_content(result['content']) + self.__check_content(result["content"]) def __check_content(self, content: dict) -> None: - subparagraphs = content['structure']['subparagraphs'] - self.assertEqual('A long time ago in a galaxy far far away ', subparagraphs[0]['text']) - self.assertEqual('Example', subparagraphs[1]['text']) - self.assertEqual('Some author', subparagraphs[2]['text']) - self.assertEqual('This is simple table', subparagraphs[3]['text']) - - table = content['tables'][0]['cells'] - self.assertEqual('', table[0][0]) - self.assertEqual('Header1', table[0][1]) - self.assertEqual('Header2', table[0][2]) - self.assertEqual('Header3', table[0][3]) - self.assertEqual('Some content', table[1][0]) - self.assertEqual('A', table[1][1]) - self.assertEqual('B', table[1][2]) - self.assertEqual('C', table[1][3]) + subparagraphs = content["structure"]["subparagraphs"] + self.assertEqual("A long time ago in a galaxy far far away ", subparagraphs[0]["text"]) + self.assertEqual("Example", subparagraphs[1]["text"]) + self.assertEqual("Some author", subparagraphs[2]["text"]) + self.assertEqual("This is simple table", subparagraphs[3]["text"]) + + table = content["tables"][0]["cells"] + self.assertEqual("", table[0][0]) + self.assertEqual("Header1", table[0][1]) + self.assertEqual("Header2", table[0][2]) + self.assertEqual("Header3", table[0][3]) + self.assertEqual("Some content", table[1][0]) + self.assertEqual("A", table[1][1]) + self.assertEqual("B", table[1][2]) + self.assertEqual("C", table[1][3])
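The pptx checks read a table as content["tables"][i]["cells"], a row-major list of string lists, so a cell is addressed as cells[row][column]. A sketch of the structure these assertions walk, using the values from the example deck above:

    content = {
        "tables": [
            {"cells": [
                ["", "Header1", "Header2", "Header3"],
                ["Some content", "A", "B", "C"],
            ]}
        ]
    }

    table = content["tables"][0]["cells"]
    assert table[0][1] == "Header1"  # row 0, column 1
    assert table[1][3] == "C"        # row 1, column 3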
Настоящие Требования разработаны в соответствии с Федеральным законом", - get_by_tree_path(content, "0.1.0")["text"]) + self.assertIn("1. Настоящие Требования разработаны в соответствии с Федеральным законом", get_by_tree_path(content, "0.1.0")["text"]) def test_special_symbols(self) -> None: file_name = "special_symbol.txt" @@ -69,7 +68,7 @@ def test_large_file(self) -> None: content = result["content"]["structure"]["subparagraphs"][0]["text"] for line_id, line in enumerate(content.split("\n")): if line.strip() != "": - self.assertEqual("Line number {:09d}".format(line_id), line) + self.assertEqual(f"Line number {line_id:09d}", line) def test_txt_with_law(self) -> None: file_name = "17 (1).txt" @@ -97,7 +96,7 @@ def test_cp1251(self) -> None: def __check_content(self, result: dict, encoding: str) -> None: warning = result["warnings"][0] - self.assertEqual(warning, "encoding is {}".format(encoding)) + self.assertEqual(warning, f"encoding is {encoding}") path = self._get_abs_path("utf8.txt") with open(path) as file: text = file.read() @@ -109,17 +108,17 @@ def __check_football(self, content: dict) -> None: self.assertTrue(node["text"].startswith(" Association football, more commonly known as simply")) self.assertTrue(node["text"].endswith("The team with the higher number of goals wins the game.\n\n")) annotations = node["annotations"] - self.assertIn({'name': 'spacing', 'value': '50', 'start': 0, 'end': 546}, annotations) + self.assertIn({"name": "spacing", "value": "50", "start": 0, "end": 546}, annotations) node = content[1] self.assertTrue(node["text"].startswith(" Football is played in accordance with a set of rules known")) self.assertTrue(node["text"].strip().endswith("the coin toss prior to kick-off or penalty kicks.")) annotations = node["annotations"] - self.assertIn({'name': 'spacing', 'value': '100', 'start': 0, 'end': 163}, annotations) + self.assertIn({"name": "spacing", "value": "100", "start": 0, "end": 163}, annotations) node = content[2] self.assertTrue(node["text"].startswith(" Football is governed internationally by the International")) self.assertTrue(node["text"].endswith("the 2019 FIFA Women's World Cup in France.\n\n")) annotations = node["annotations"] - self.assertIn({'name': 'spacing', 'value': '400', 'start': 0, 'end': 164}, annotations) - self.assertIn({'name': 'spacing', 'value': '50', 'start': 164, 'end': 1068}, annotations) + self.assertIn({"name": "spacing", "value": "400", "start": 0, "end": 164}, annotations) + self.assertIn({"name": "spacing", "value": "50", "start": 164, "end": 1068}, annotations) self.assertTrue(content[3]["text"].startswith(" The most prestigious competitions in European club")) self.assertTrue(content[3]["text"].endswith("cost in excess of £600 million/€763 million/US$1.185 billion.\n")) diff --git a/tests/api_tests/test_api_format_xml.py b/tests/api_tests/test_api_format_xml.py index bbe80a1f..5810bfdc 100644 --- a/tests/api_tests/test_api_format_xml.py +++ b/tests/api_tests/test_api_format_xml.py @@ -12,6 +12,6 @@ def test_xml(self) -> None: result = self._send_request(file_name, data={"structure_type": "linear"}) subparagraphs = result["content"]["structure"]["subparagraphs"] self.assertEqual('\n', subparagraphs[0]["text"]) - self.assertEqual('\n', subparagraphs[1]["text"]) - self.assertEqual(' Tove\n', subparagraphs[2]["text"]) - self.assertEqual('', subparagraphs[3]["text"]) + self.assertEqual("\n", subparagraphs[1]["text"]) + self.assertEqual(" Tove\n", subparagraphs[2]["text"]) + self.assertEqual("", subparagraphs[3]["text"]) 
diff --git a/tests/api_tests/test_api_misc_main.py b/tests/api_tests/test_api_misc_main.py index 1527c915..cbc47976 100644 --- a/tests/api_tests/test_api_misc_main.py +++ b/tests/api_tests/test_api_misc_main.py @@ -1,5 +1,6 @@ import json import os + import requests from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -26,7 +27,7 @@ def test_send_wo_file(self) -> None: def test_version(self) -> None: version = self.__get_version() - r = requests.get("http://{host}:{port}/version".format(host=self._get_host(), port=self._get_port())) + r = requests.get(f"http://{self._get_host()}:{self._get_port()}/version") self.assertEqual(version, r.text.strip()) def test_version_parsed_file(self) -> None: @@ -38,7 +39,7 @@ def test_text(self) -> None: file_name = "example.txt" result = self._send_request(os.path.join("txt", file_name), data=dict(structure_type="tree")) content = result["content"]["structure"] - self.assertEqual(content["subparagraphs"][0]["text"].strip(), 'Пример документа') - self.assertEqual(content["subparagraphs"][1]["subparagraphs"][0]["text"].strip(), '1. Элемент нумерованного списка') - self.assertEqual(content["subparagraphs"][1]["subparagraphs"][0]["metadata"]['paragraph_type'], 'list_item') - self._check_metainfo(result['metadata'], 'text/plain', file_name) + self.assertEqual(content["subparagraphs"][0]["text"].strip(), "Пример документа") + self.assertEqual(content["subparagraphs"][1]["subparagraphs"][0]["text"].strip(), "1. Элемент нумерованного списка") + self.assertEqual(content["subparagraphs"][1]["subparagraphs"][0]["metadata"]["paragraph_type"], "list_item") + self._check_metainfo(result["metadata"], "text/plain", file_name) diff --git a/tests/api_tests/test_api_misc_multipage_table.py b/tests/api_tests/test_api_misc_multipage_table.py index 5531b265..67784ec1 100644 --- a/tests/api_tests/test_api_misc_multipage_table.py +++ b/tests/api_tests/test_api_misc_multipage_table.py @@ -11,9 +11,9 @@ def _get_abs_path(self, file_name: str) -> str: def _get_tables(self, file_name: str) -> List[dict]: result = self._send_request(file_name, {"pdf_with_text_layer": "false"}) - content = result['content'] + content = result["content"] self._test_table_refs(content=content) - tables = content['tables'] + tables = content["tables"] tree = content["structure"] self._check_tree_sanity(tree=tree) return tables @@ -49,20 +49,16 @@ def test_api_ml_table_recognition_synthetic_data_3(self) -> None: self.assertEqual(len(tables), 1) table = tables[0] rows = table["cells"] - self.assertListEqual(["Заголовок\nБольшой", "Еще один большой заголовок", "Еще один большой заголовок", - "Еще один большой заголовок", "Еще один большой заголовок"], rows[0]) - self.assertListEqual(["Заголовок\nБольшой", "Заголовок поменьше 1", "Заголовок поменьше 1", - "Заголовок поменьше 2", "Заголовок поменьше 2"], rows[1]) - self.assertListEqual(["Заголовок\nБольшой", "Заголовочек 1", "Заголовочек 2", - "Заголовочек 3", "Заголовочек 4"], rows[2]) + self.assertListEqual(["Заголовок\nБольшой", "Еще один большой заголовок", "Еще один большой заголовок", "Еще один большой заголовок", + "Еще один большой заголовок"], rows[0]) + self.assertListEqual(["Заголовок\nБольшой", "Заголовок поменьше 1", "Заголовок поменьше 1", "Заголовок поменьше 2", "Заголовок поменьше 2"], rows[1]) + self.assertListEqual(["Заголовок\nБольшой", "Заголовочек 1", "Заголовочек 2", "Заголовочек 3", "Заголовочек 4"], rows[2]) self.assertListEqual(["Данные 1", "Данные 1", "Данные 1", "Данные 1", "Данные 1"], rows[3]) 
self.assertListEqual(["Данные 2", "Данные 2", "Данные 2", "Данные 2", "Данные 2"], rows[4]) self.assertListEqual(["Данные 3", "Данные 3", "Данные 3", "Данные 3", "Данные 3"], rows[5]) self.assertListEqual(["Данные 4", "Данные 4", "Данные 4", "Данные 4", "Данные 4"], rows[6]) self.assertListEqual(["Данные 5", "Данные 5", "Данные 5", "Данные 5", "Данные 5"], rows[7]) - self.assertListEqual(["Заголовок\nБольшой", "Заголовок поменьше 1", "Заголовок поменьше 1", - "Заголовок поменьше 2", "Заголовок поменьше 2"], rows[8]) - self.assertListEqual(["Заголовок\nБольшой", "Заголовочек 1", "Заголовочек 2", - "Заголовочек 3", "Заголовочек 4"], rows[9]) + self.assertListEqual(["Заголовок\nБольшой", "Заголовок поменьше 1", "Заголовок поменьше 1", "Заголовок поменьше 2", "Заголовок поменьше 2"], rows[8]) + self.assertListEqual(["Заголовок\nБольшой", "Заголовочек 1", "Заголовочек 2", "Заголовочек 3", "Заголовочек 4"], rows[9]) self.assertListEqual(["Данные 6", "Данные 6", "Данные 6", "Данные 6", "Данные 6"], rows[10]) self.assertListEqual(["Данные 7", "Данные 7", "Данные 7", "Данные 7", "Данные 7"], rows[11]) diff --git a/tests/api_tests/test_api_misc_with_attachments.py b/tests/api_tests/test_api_misc_with_attachments.py index 479e2bb2..0f53199a 100644 --- a/tests/api_tests/test_api_misc_with_attachments.py +++ b/tests/api_tests/test_api_misc_with_attachments.py @@ -18,18 +18,18 @@ def _check_attachments(self, attachments: List[dict]) -> None: def test_wo_attachments_excel(self) -> None: file_name = "xlsx/example.xlsx" result = self._send_request(file_name, dict(with_attachments=True)) - self.assertEqual([], result['attachments']) + self.assertEqual([], result["attachments"]) def test_get_attachments_xlxs_depth_1(self) -> None: file_name = "xlsx/example_with_images.xlsx" result = self._send_request(file_name, dict(with_attachments=True)) - attachments = result['attachments'] + attachments = result["attachments"] self._check_attachments(attachments) def test_get_attachments_xls_depth_1(self) -> None: file_name = "xlsx/example_with_images.xls" result = self._send_request(file_name, dict(with_attachments=True)) - attachments = result['attachments'] + attachments = result["attachments"] self._check_attachments(attachments) def test_get_attachments_pdf_depth_1(self) -> None: @@ -44,7 +44,7 @@ def test_get_attachments_pdf_depth_1(self) -> None: self.assertEqual(attachments[3]["metadata"]["file_name"], "attachment.txt") self.assertEqual(attachments[4]["metadata"]["file_type"], "application/json") - def test_attachments_pmi_document(self): + def test_attachments_pmi_document(self) -> None: file_name = "pdf_with_text_layer/Document635.pdf" result = self._send_request(file_name, dict(with_attachments=True, pdf_with_text_layer="tabby")) @@ -69,14 +69,13 @@ def test_need_content_analysis(self) -> None: def test_get_without_attachments(self) -> None: file_name = "with_attachments/example_with_attachments_depth_1.pdf" result = self._send_request(file_name, dict(with_attachments=False)) - self.assertEqual([], result['attachments']) + self.assertEqual([], result["attachments"]) def test_json_attachments(self) -> None: - file_name = 'json/with_html.json' + file_name = "json/with_html.json" parameters = dict() parameters["with_attachments"] = True - parameters["html_fields"] = json.dumps( - [["title"], ["body"], ["example"], ["deep_key1", "deep_key2", "deep_key3"]]) + parameters["html_fields"] = json.dumps([["title"], ["body"], ["example"], ["deep_key1", "deep_key2", "deep_key3"]]) result = self._send_request(file_name, 
parameters) attachments = result["attachments"] @@ -84,13 +83,11 @@ def test_json_attachments(self) -> None: self.assertEqual(len(attachments), 4) def test_json_invalid_html_fields(self) -> None: - file_name = 'json/with_html.json' + file_name = "json/with_html.json" parameters = dict() parameters["with_attachments"] = True - parameters["html_fields"] = json.dumps( - [["title"], ["example"], ["another_field"], ["test"], ["lists"], - ["log"], ["text"], ["deep_key1", "deep_key2", "deep_key3"]] - ) + parameters["html_fields"] = json.dumps([["title"], ["example"], ["another_field"], ["test"], ["lists"], ["log"], ["text"], + ["deep_key1", "deep_key2", "deep_key3"]]) result = self._send_request(file_name, parameters) attachments = result["attachments"] @@ -98,12 +95,8 @@ def test_json_invalid_html_fields(self) -> None: self.assertEqual(len(attachments), 4) def test_json_with_html_fields_with_scripts(self) -> None: - file_name = 'json/example2.json' - parameters = dict( - with_attachments=True, - html_fields=json.dumps([["text"]]), - need_content_analysis=True - ) + file_name = "json/example2.json" + parameters = dict(with_attachments=True, html_fields=json.dumps([["text"]]), need_content_analysis=True) result = self._send_request(file_name, parameters) attachments = result["attachments"] @@ -119,10 +112,7 @@ def test_json_with_bad_style_in_html(self) -> None: file_name = "json/0001-p1.json" parameters = dict() parameters["with_attachments"] = True - parameters["html_fields"] = json.dumps( - [["news_link"], ["publication_title"], ["publication_date"], - ["publication_author"], ["text_publication"]] - ) + parameters["html_fields"] = json.dumps([["news_link"], ["publication_title"], ["publication_date"], ["publication_author"], ["text_publication"]]) result = self._send_request(file_name, parameters) attachments = result["attachments"] @@ -130,26 +120,23 @@ def test_json_with_bad_style_in_html(self) -> None: self.assertEqual(len(attachments), 5) def test_docx_attachments(self) -> None: - file_name = 'with_attachments/with_attachments_0.docx' + file_name = "with_attachments/with_attachments_0.docx" result = self._send_request(file_name, dict(with_attachments=True, need_content_analysis=True)) - attachments = result['attachments'] - names = [attachment['metadata']['file_name'] for attachment in attachments] - self.assertIn('arch_with_attachs.zip', names) - self.assertIn('VVP_global_table.pdf', names) - self.assertIn('lorem.txt', names) - self.assertIn('books.csv', names) + attachments = result["attachments"] + names = [attachment["metadata"]["file_name"] for attachment in attachments] + self.assertIn("arch_with_attachs.zip", names) + self.assertIn("VVP_global_table.pdf", names) + self.assertIn("lorem.txt", names) + self.assertIn("books.csv", names) - arch = [attachment for attachment in attachments - if attachment['metadata']['file_name'] == 'arch_with_attachs.zip'][0] - self.assertEqual(len(arch['attachments']), 4) + arch = [attachment for attachment in attachments if attachment["metadata"]["file_name"] == "arch_with_attachs.zip"][0] + self.assertEqual(len(arch["attachments"]), 4) - txt = [attachment for attachment in attachments - if attachment['metadata']['file_name'] == 'lorem.txt'][0] + txt = [attachment for attachment in attachments if attachment["metadata"]["file_name"] == "lorem.txt"][0] - self.assertIn("Adipisicing est non minim aute reprehenderit incididunt magna" - " ad consectetur ad occaecat anim voluptate culpa fugiat", - txt['content']['structure']['subparagraphs'][0]['text'], ) + 
self.assertIn("Adipisicing est non minim aute reprehenderit incididunt magna ad consectetur ad occaecat anim voluptate culpa fugiat", + txt["content"]["structure"]["subparagraphs"][0]["text"], ) def test_docx_images_base64(self) -> None: metadata = self.__check_base64(True) diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py index 774b6a10..d1b90a4a 100644 --- a/tests/api_tests/test_api_misc_with_images_refs.py +++ b/tests/api_tests/test_api_misc_with_images_refs.py @@ -14,19 +14,19 @@ def test_docx_with_images(self) -> None: content = result["content"]["structure"] image_paragraph = content["subparagraphs"][0] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.png']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image1.png"]) image_paragraph = content["subparagraphs"][2] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg']) - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image2.jpeg"]) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.jpeg"]) image_paragraph = content["subparagraphs"][5] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image4.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image4.jpeg"]) image_paragraph = content["subparagraphs"][6] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image5.jpeg']) - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image6.jpeg']) - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image7.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image5.jpeg"]) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image6.jpeg"]) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image7.jpeg"]) def test_odt_with_images(self) -> None: file_name = "odt_with_images.odt" @@ -35,13 +35,13 @@ def test_odt_with_images(self) -> None: content = result["content"]["structure"] image_paragraph = content["subparagraphs"][0] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image1.jpeg"]) image_paragraph = content["subparagraphs"][7] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image2.jpeg"]) image_paragraph = content["subparagraphs"][8] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.jpeg"]) def test_docx_with_images_from_mac(self) -> None: file_name = "doc_with_images.docx" @@ -50,15 +50,15 @@ def test_docx_with_images_from_mac(self) -> None: content = result["content"]["structure"] 
image_paragraph = content["subparagraphs"][2] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image1.jpeg"]) image_paragraph = content["subparagraphs"][3] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image2.jpeg"]) image_paragraph = content["subparagraphs"][5] - self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.png']) + self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.png"]) def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None: text = image_paragraph["text"] image_annotations = image_paragraph["annotations"] - self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_uid}, image_annotations) + self.assertIn({"start": 0, "end": len(text), "name": "attachment", "value": image_uid}, image_annotations) diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py index d3d9b8c5..5de2c9ce 100644 --- a/tests/api_tests/test_api_module_table_recognizer.py +++ b/tests/api_tests/test_api_module_table_recognizer.py @@ -14,46 +14,45 @@ def _get_abs_path(self, file_name: str) -> str: def test_api_table_recognition_3(self) -> None: file_name = "example_with_table16.jpg" res = self._send_request(file_name) - table = res['content']['tables'][0] - self._check_similarity(table['cells'][0][1], "Наименование участкового лестничества") - self._check_similarity(table['cells'][2][1], "Итого") - self._check_similarity(table['cells'][13][0], "Выращивание лесных, плодовых, ягодных, " - "декоративных растений, лекарственных растений") - self._check_similarity(table['cells'][13][3], "1272100,0") + table = res["content"]["tables"][0] + self._check_similarity(table["cells"][0][1], "Наименование участкового лестничества") + self._check_similarity(table["cells"][2][1], "Итого") + self._check_similarity(table["cells"][13][0], "Выращивание лесных, плодовых, ягодных, декоративных растений, лекарственных растений") + self._check_similarity(table["cells"][13][3], "1272100,0") def test_api_table_recognition_4(self) -> None: file_name = "example_with_table17.jpg" - table = self._send_request(file_name)['content']['tables'][0] - self._check_similarity(table['cells'][0][1], "Наименование\nучасткового\nлестничества") - self._check_similarity(table['cells'][0][2], 'Неречень кварталов или их частей') - self._check_similarity(table['cells'][3][3], '801 976,3') + table = self._send_request(file_name)["content"]["tables"][0] + self._check_similarity(table["cells"][0][1], "Наименование\nучасткового\nлестничества") + self._check_similarity(table["cells"][0][2], "Неречень кварталов или их частей") + self._check_similarity(table["cells"][3][3], "801 976,3") def test_api_table_recognition_horizontal_union_1(self) -> None: - file_name = 'example_with_table_horizontal_union.jpg' - table = self._send_request(file_name)['content']['tables'][0] + file_name = "example_with_table_horizontal_union.jpg" + table = self._send_request(file_name)["content"]["tables"][0] - self._check_similarity(table['cells'][0][1], "Наименование позиции") - self._check_similarity(table['cells'][1][1], "Наименование позиции") - 
self._check_similarity(table['cells'][0][2], "Начальная (максимальная) цена за единицу\nпродукции") - self._check_similarity(table['cells'][1][2], "рублей, включая НДС\n(20%)") - self._check_similarity(table['cells'][0][3], "Начальная (максимальная) цена за единицу\nпродукции") - self._check_similarity(table['cells'][1][3], "рублей, без учета НДС\n(20%)") + self._check_similarity(table["cells"][0][1], "Наименование позиции") + self._check_similarity(table["cells"][1][1], "Наименование позиции") + self._check_similarity(table["cells"][0][2], "Начальная (максимальная) цена за единицу\nпродукции") + self._check_similarity(table["cells"][1][2], "рублей, включая НДС\n(20%)") + self._check_similarity(table["cells"][0][3], "Начальная (максимальная) цена за единицу\nпродукции") + self._check_similarity(table["cells"][1][3], "рублей, без учета НДС\n(20%)") def test_api_table_recognition_hor_and_vert_union_2(self) -> None: file_name = "example_with_table_hor_vert_union.png" - table = self._send_request(file_name, data={"language": "rus"})['content']['tables'][0] + table = self._send_request(file_name, data={"language": "rus"})["content"]["tables"][0] - self._check_similarity(table['cells'][0][6], "Стоимость единицы, руб.") - self._check_similarity(table['cells'][1][6], "В Tоm числе") - self._check_similarity(table['cells'][2][6], "Осн.З/п") + self._check_similarity(table["cells"][0][6], "Стоимость единицы, руб.") + self._check_similarity(table["cells"][1][6], "В Tоm числе") + self._check_similarity(table["cells"][2][6], "Осн.З/п") - self._check_similarity(table['cells'][0][10], "Общая стоимость, руб.") - self._check_similarity(table['cells'][1][10], "Всего") - self._check_similarity(table['cells'][2][10], "Всего") + self._check_similarity(table["cells"][0][10], "Общая стоимость, руб.") + self._check_similarity(table["cells"][1][10], "Всего") + self._check_similarity(table["cells"][2][10], "Всего") - self._check_similarity(table['cells'][0][12], "Общая стоимость, руб.") - self._check_similarity(table['cells'][1][12], "В том числе") - self._check_similarity(table['cells'][2][12], "Эк.Маш") + self._check_similarity(table["cells"][0][12], "Общая стоимость, руб.") + self._check_similarity(table["cells"][1][12], "В том числе") + self._check_similarity(table["cells"][2][12], "Эк.Маш") def _check_header_table(self, cells: List[List[str]]) -> None: self._check_similarity(cells[0][0], "№\nп/п", threshold=0.5) @@ -87,16 +86,16 @@ def _check_header_table(self, cells: List[List[str]]) -> None: def test_api_table_recognition_with_diff_orient_cells_90(self) -> None: file_name = "example_table_with_90_orient_cells.pdf" response = self._send_request(file_name, dict(orient_analysis_cells=True, orient_cell_angle="90")) - table = response['content']['tables'][0] + table = response["content"]["tables"][0] - self._check_header_table(table['cells']) + self._check_header_table(table["cells"]) @unittest.skip def test_api_table_recognition_with_diff_orient_cells_270(self) -> None: file_name = "example_table_with_270_orient_cells.pdf" response = self._send_request(file_name, dict(orient_analysis_cells=True, orient_cell_angle="270")) - table = response['content']['tables'][0] - self._check_header_table(table['cells']) + table = response["content"]["tables"][0] + self._check_header_table(table["cells"]) def test_pdf_table(self) -> None: file_name = "example_with_table1.pdf" @@ -114,13 +113,12 @@ def test_pdf_table(self) -> None: self.assertEqual("1", rows[1][0]) self.assertEqual('ООО "Айтехникс"', rows[1][1]) - 
self.assertEqual("Емельяновский район, МО\nСолонцовский сельсовет, площадка\nЗападная, 2a cr3", - rows[1][2]) + self.assertEqual("Емельяновский район, МО\nСолонцовский сельсовет, площадка\nЗападная, 2a cr3", rows[1][2]) self.assertEqual("Наталья Медведева\n8-908-215-75-05", rows[1][3]) self.assertEqual("6", rows[6][0]) self.assertEqual('ООО "Скай-\nтехнолоджи"', rows[6][1]) - self.assertEqual('Пр. Свободный 75', rows[6][2]) + self.assertEqual("Пр. Свободный 75", rows[6][2]) self.assertEqual("Андрей Горбунов\n8-913-560-50-09", rows[6][3]) def test_rectangular(self) -> None: @@ -130,9 +128,9 @@ def test_rectangular(self) -> None: self.assertEqual(1, len(tables)) table = tables[0] cells = table["cells"] - self.assertListEqual(['Фамилия', 'Имя', 'Отчество'], cells[0]) - self.assertListEqual(['Иванов', 'Иван', 'Иванович'], cells[1]) - self.assertListEqual(['Петров', 'Пётр', 'Петрович'], cells[2]) + self.assertListEqual(["Фамилия", "Имя", "Отчество"], cells[0]) + self.assertListEqual(["Иванов", "Иван", "Иванович"], cells[1]) + self.assertListEqual(["Петров", "Пётр", "Петрович"], cells[2]) def test_merged_vertical(self) -> None: file_name = "merged_vertical.pdf" @@ -142,9 +140,9 @@ def test_merged_vertical(self) -> None: table = tables[0] cells = table["cells"] - self.assertListEqual(['Фамилия\nИванов\nПетров', 'Имя', 'Отчество'], cells[0]) - self.assertListEqual(['Фамилия\nИванов\nПетров', 'Иван', 'Иванович'], cells[1]) - self.assertListEqual(['Фамилия\nИванов\nПетров', 'Пётр', 'Петрович'], cells[2]) + self.assertListEqual(["Фамилия\nИванов\nПетров", "Имя", "Отчество"], cells[0]) + self.assertListEqual(["Фамилия\nИванов\nПетров", "Иван", "Иванович"], cells[1]) + self.assertListEqual(["Фамилия\nИванов\nПетров", "Пётр", "Петрович"], cells[2]) def test_merged_horizontal(self) -> None: file_name = "merged_horizontal.pdf" @@ -154,9 +152,9 @@ def test_merged_horizontal(self) -> None: table = tables[0] cells = table["cells"] - self.assertListEqual(['Фамилия Имя Отчество', 'Фамилия Имя Отчество', 'Фамилия Имя Отчество'], cells[0]) - self.assertListEqual(['Иванов', 'Иван', 'Иванович'], cells[1]) - self.assertListEqual(['Петров', 'Пётр', 'Петрович'], cells[2]) + self.assertListEqual(["Фамилия Имя Отчество", "Фамилия Имя Отчество", "Фамилия Имя Отчество"], cells[0]) + self.assertListEqual(["Иванов", "Иван", "Иванович"], cells[1]) + self.assertListEqual(["Петров", "Пётр", "Петрович"], cells[2]) def test_tables_annotations(self) -> None: file_name = "two_column_document.pdf" diff --git a/tests/api_tests/test_api_pdf_auto_text_layer.py b/tests/api_tests/test_api_pdf_auto_text_layer.py new file mode 100644 index 00000000..5c56b2c3 --- /dev/null +++ b/tests/api_tests/test_api_pdf_auto_text_layer.py @@ -0,0 +1,90 @@ +import os + +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader + + +class TestApiPdfAutoTextLayer(AbstractTestApiDocReader): + + def _get_abs_path(self, file_name: str) -> str: + return os.path.join(self.data_directory_path, "pdf_auto", file_name) + + def test_pdf_auto_auto_columns(self) -> None: + file_name = "0004057v1.pdf" + parameters = dict(with_attachments=True, pdf_with_text_layer="auto", is_one_column_document="auto") + result = self._send_request(file_name, parameters) + warnings = result["warnings"] + self.assertIn("assume page 0 has 2 columns", warnings) + self.assertIn("assume page 1 has 2 columns", warnings) + self.assertIn("assume document has correct text layer", warnings) + + def test_pdf_auto_auto_columns_each_page_have_different_columns(self) -> None: + 
file_name = "liao2020_merged_organized.pdf" + parameters = dict(with_attachments=True, pdf_with_text_layer="auto", is_one_column_document="auto") + result = self._send_request(file_name, parameters) + warnings = result["warnings"] + self.assertIn("assume page 0 has 1 columns", warnings) + self.assertIn("assume page 1 has 2 columns", warnings) + self.assertIn("assume page 2 has 1 columns", warnings) + self.assertIn("assume page 3 has 2 columns", warnings) + self.assertIn("assume document has correct text layer", warnings) + + def test_pdf_auto_auto_columns_each_page_have_same_columns_except_first(self) -> None: + file_name = "liao2020_merged-1-5.pdf" + parameters = dict(with_attachments=True, pdf_with_text_layer="auto", is_one_column_document="auto") + result = self._send_request(file_name, parameters) + warnings = result["warnings"] + self.assertIn("assume page 0 has 1 columns", warnings) + self.assertIn("assume page 1 has 2 columns", warnings) + self.assertIn("assume page 2 has 2 columns", warnings) + self.assertIn("assume page 3 has 2 columns", warnings) + self.assertIn("assume page 4 has 2 columns", warnings) + self.assertIn("assume document has correct text layer", warnings) + + def test_pdf_auto_text_layer_2(self) -> None: + file_name = "e09d__cs-pspc-xg-15p-portable-radio-quick-guide.pdf" + self._send_request(file_name, dict(with_attachments=True, pdf_with_text_layer="auto")) + + def test_auto_pdf_with_scans(self) -> None: + file_name = "tz_scan_1page.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="auto")) + tree = result["content"]["structure"] + self._check_tree_sanity(tree) + self.assertIn("Техническое задание", self._get_by_tree_path(tree, "0.0")["text"]) + + def test_auto_pdf_with_text_layer(self) -> None: + file_name = os.path.join("..", "pdf_with_text_layer", "english_doc.pdf") + result = self._send_request(file_name, dict(pdf_with_text_layer="auto")) + self.assertIn("assume document has correct text layer", result["warnings"]) + self.check_english_doc(result) + + def test_auto_pdf_with_wrong_text_layer(self) -> None: + file_name = "english_doc_bad_text.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="auto")) + self.assertIn("assume document has incorrect text layer", result["warnings"]) + self.check_english_doc(result) + + def test_auto_document_mixed(self) -> None: + file_name = "mixed_pdf.pdf" + for pdf_with_text_layer in "auto", "auto_tabby": + result = self._send_request(file_name, dict(pdf_with_text_layer=pdf_with_text_layer)) + self.assertIn("assume document has correct text layer", result["warnings"]) + self.assertIn("assume first page has no text layer", result["warnings"]) + self.check_english_doc(result) + structure = result["content"]["structure"] + list_items = structure["subparagraphs"][1]["subparagraphs"] + self.assertEqual("3) продолжаем список\n", list_items[2]["text"]) + self.assertEqual("4) Список идёт своим чередом\n", list_items[3]["text"]) + self.assertEqual("5) заканчиваем список\n", list_items[4]["text"]) + self.assertEqual("6) последний элемент списка.\n", list_items[5]["text"]) + + def test_auto_partially_read(self) -> None: + file_name = "mixed_pdf.pdf" + data = {"pdf_with_text_layer": "auto", "pages": "2:"} + result = self._send_request(file_name, data) + structure = result["content"]["structure"] + self.assertEqual("", structure["subparagraphs"][0]["text"]) + list_items = structure["subparagraphs"][1]["subparagraphs"] + self.assertEqual("3) продолжаем список\n", list_items[0]["text"]) + 
self.assertEqual("4) Список идёт своим чередом\n", list_items[1]["text"]) + self.assertEqual("5) заканчиваем список\n", list_items[2]["text"]) + self.assertEqual("6) последний элемент списка.\n", list_items[3]["text"]) diff --git a/tests/api_tests/test_api_pdf_page_limit.py b/tests/api_tests/test_api_pdf_page_limit.py new file mode 100644 index 00000000..01be3de1 --- /dev/null +++ b/tests/api_tests/test_api_pdf_page_limit.py @@ -0,0 +1,70 @@ +import os + +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader + + +class TestApiPdfReader(AbstractTestApiDocReader): + + def _get_abs_path(self, file_name: str) -> str: + return os.path.join(self.data_directory_path, "pdf_with_text_layer", file_name) + + lines = ["Первая страница", + "Вторая страница", + "Третья страница", + "Четвёртая страница", + "Пятая страница", + "Шестая страница", + "Седьмая страница", + "Восьмая страница", + "Девятая страница"] + + def test_no_text_layer(self) -> None: + self.__check_limit("false", check_partially=True) + self.__check_out_of_limit("false") + + def test_text_layer(self) -> None: + self.__check_limit("true", check_partially=True) + self.__check_out_of_limit("true") + + def test_auto_text_layer(self) -> None: + self.__check_limit("auto", check_partially=True) + self.__check_out_of_limit("auto") + + def test_tabby_layer(self) -> None: + self.__check_limit("tabby", check_partially=True) + + def test_auto_tabby(self) -> None: + self.__check_limit("auto_tabby", check_partially=True) + self.__check_out_of_limit("auto_tabby") + + def __check_out_of_limit(self, reader: str) -> None: + text_expected = "" + for pages in ("10:11", ): + self.__check(pages, text_expected, reader=reader) + + def __check_limit(self, reader: str, check_partially: bool = False) -> None: + text_expected = "" + self.__check("2:1", text_expected, reader=reader, check_partially=check_partially) + + text_expected = "\n".join(self.lines[:]) + for pages in "", ":", "0:9", "0:20", ":9", "0:": + self.__check(pages, text_expected, reader=reader) + + text_expected = "\n".join(self.lines[0:2]) + self.__check("1:2", text_expected, reader=reader, check_partially=check_partially) + + text_expected = "\n".join(self.lines[0:9]) + self.__check("1:9", text_expected, reader=reader, check_partially=False) + + def __check(self, pages: str, text_expected: str, reader: str, check_partially: bool = False) -> None: + + params = dict(pdf_with_text_layer=reader, pages=pages, is_one_column_document="true") + result = self._send_request("multipage.pdf", params) + if check_partially: + self.assertIn("The document is partially parsed", result["warnings"]) + self.assertIn('first_page', result['metadata']) + self.assertIn('last_page', result['metadata']) + tree = result["content"]["structure"] + node = self._get_by_tree_path(tree, "0.0") + text = node["text"].strip() + self.assertEqual(text_expected, text, "{} and {}".format(pages, reader)) diff --git a/tests/api_tests/test_api_pdf_with_text.py b/tests/api_tests/test_api_pdf_with_text.py new file mode 100644 index 00000000..e4bc8d64 --- /dev/null +++ b/tests/api_tests/test_api_pdf_with_text.py @@ -0,0 +1,131 @@ +import os +import unittest +from typing import List + +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader + + +class TestApiPdfReader(AbstractTestApiDocReader): + + def _get_abs_path(self, file_name: str) -> str: + return os.path.join(self.data_directory_path, "pdf_with_text_layer", file_name) + + def __filter_by_name(self, annotations: List[dict], name: str) -> List[dict]: + 
return [annotation for annotation in annotations if annotation["name"] == name] + + @unittest.skip("TODO") + def test_pdf_with_text_style(self) -> None: + file_name = "diff_styles.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false")) + tree = result["content"]["structure"] + self._check_tree_sanity(tree) + + node = self._get_by_tree_path(tree, "0.0") + self.assertEqual("1.1TimesNewRomanItalicBold20\n", node["text"]) + self.assertIn({"start": 0, "end": 28, "name": "size", "value": "20.0"}, node["annotations"]) + + node = self._get_by_tree_path(tree, "0.1") + annotations_size = self.__filter_by_name(name="size", annotations=node["annotations"]) + self.assertIn({"start": 0, "end": 26, "name": "size", "value": "16.0"}, annotations_size) + self.assertEqual(len(node["annotations"]), 5) + self.assertEqual("Different styles(Arial16):\n", node["text"]) + + node = self._get_by_tree_path(tree, "0.2.2") + self.assertEqual("3. TimesNewRomanItalic14, Calibri18, Tahoma16\n", node["text"]) + self.assertEqual("3. ", node["text"][0:3]) + self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"]) + self.assertIn({"start": 0, "end": 2, "name": "size", "value": "16.0"}, node["annotations"]) + self.assertEqual("TimesNewRomanItalic14, ", node["text"][3:26]) + self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"]) + self.assertIn({"start": 3, "end": 25, "name": "size", "value": "14.0"}, node["annotations"]) + self.assertEqual("Calibri18, ", node["text"][26:37]) + self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"]) + self.assertIn({"start": 26, "end": 36, "value": "18.0", "name": "size"}, node["annotations"]) + self.assertEqual("Tahoma16\n", node["text"][37:46]) + self.assertIn({"start": 37, "end": 45, "value": "Tahoma", "name": "style"}, node["annotations"]) + self.assertIn({"start": 37, "end": 45, "name": "size", "value": "16.0"}, node["annotations"]) + self.assertEqual(9, len(node["annotations"])) + + def test_pdf_with_text_style_2(self) -> None: + file_name = "2-column-state.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false")) + tree = result["content"]["structure"] + self._check_tree_sanity(tree) + subs = tree["subparagraphs"] + sub = self._get_by_tree_path(tree, "0.0") + self.assertEqual("Compromising Tor Anonymity\n", sub["text"][0:27]) + annotations_size = self.__filter_by_name(name="size", annotations=subs[0]["annotations"]) + self.assertIn({"start": 0, "end": 61, "name": "size", "value": "18.0"}, annotations_size) + + annotations_style = self.__filter_by_name(name="style", annotations=subs[0]["annotations"]) + self.assertIn({"start": 0, "end": 61, "name": "style", "value": "Helvetica-Bold"}, annotations_style) + + annotations_bold = self.__filter_by_name(name="bold", annotations=subs[0]["annotations"]) + self.assertIn({"start": 0, "end": 61, "name": "bold", "value": "True"}, annotations_bold) + + self.assertIn("Pere Manils, Abdelberi Chaabane, Stevens Le Blond,", self._get_by_tree_path(tree, "0.1")["text"]) + + @unittest.skip("TODO") + def test_pdf_with_2_columns_text(self) -> None: + file_name = "2-column-state.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false")) + + tree = result["content"]["structure"] + self._check_tree_sanity(tree)
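+        # check that text from the two columns is extracted in reading order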
+ self.assertIn("Privacy of users in P2P networks goes far beyond their\n" + "current usage and is a fundamental requirement to the adop-\n" + "tion of P2P protocols for legal usage. In a climate of cold", + self._get_by_tree_path(tree, "0.5")['text']) + + self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6")['text']) + self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.7")['text']) + + self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.8.0.0")["text"]) + self.assertIn("The Tor network was designed to provide freedom\n" + "of speech by guaranteeing anonymous communications.\n" + "Whereas the cryptographic foundations of Tor, based on\n" + "onion-routing [3, 9, 22, 24], are known to be robust, identity", + self._get_by_tree_path(tree, "0.8.0.1")["text"]) + + def test_pdf_with_2_columns_text_2(self) -> None: + file_name = "liters_state.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false")) + + tree = result["content"]["structure"] + + self.assertIn("References", self._get_by_tree_path(tree, "0.0")["text"]) + self.assertIn("[1] Navaneeth Bodla, Bharat Singh, Rama Chellappa, and", + self._get_by_tree_path(tree, "0.1")["text"]) + + def test_pdf_with_some_tables(self) -> None: + file_name = "VVP_6_tables.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="true")) + content = result["content"] + self._test_table_refs(content) + tree = content["structure"] + self._check_tree_sanity(tree) + + # checks indentations + par = self._get_by_tree_path(tree, "0.4.0.0") + self.assertIn({'end': 170, 'value': '600', 'name': 'indentation', 'start': 0}, par["annotations"]) + self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"]) + + def test_pdf_with_only_table(self) -> None: + file_name = "VVP_global_table.pdf" + result = self._send_request(file_name, dict(pdf_with_text_layer="true")) + + self.assertTrue(result["content"]["tables"][0]["metadata"]["uid"] == + result["content"]["structure"]["subparagraphs"][0]["annotations"][0]["value"]) + + def test_pdf_with_only_mp_table(self) -> None: + file_name = os.path.join("..", "tables", "multipage_table.pdf") + result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_header_footer_analysis=True)) + + table_refs = [ann["value"] for ann in result["content"]["structure"]["subparagraphs"][0]["annotations"] + if ann["name"] == "table"] + + self.assertTrue(len(result["content"]["tables"]), len(table_refs)) + for table in result["content"]["tables"]: + self.assertTrue(table["metadata"]["uid"] in table_refs) diff --git a/tests/test_style.py b/tests/test_style.py deleted file mode 100644 index 2b670bea..00000000 --- a/tests/test_style.py +++ /dev/null @@ -1,128 +0,0 @@ -# noqa -import os -import re -import unittest -import warnings -from typing import List, Tuple -import pycodestyle -from flake8.api import legacy as flake8 - - -class TestCodeFormat(unittest.TestCase): - - @staticmethod - def get_files() -> List[str]: - path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "dedoc")) - files = [] - for root, _, filenames in os.walk(path): - for file in filenames: - if not file.endswith(".py"): - continue - with open(os.path.join(root, file), "r") as f: - first_line = f.readline() - # ignore files with "noqa" in the beginning - if "# noqa" not in first_line: - files.append(os.path.join(root, file)) - return files - - def test_pep8_conformance(self) -> None: - 
"""Test that we conform to PEP8.""" - - files = self.get_files() - ignore = [ - "E501", # skip line length check because 79 is not enough - "W504" # we choose this from 503 and 504 - ] - pep8style = pycodestyle.StyleGuide(quiet=True, ignore=ignore) - file_check = pep8style.check_files(files) - if file_check.total_errors > 0: - print("GET {} ERRORS".format(file_check.total_errors)) # noqa - pep8style = pycodestyle.StyleGuide(quiet=False, ignore=ignore) - pep8style.check_files(files) - - self.assertEqual(0, file_check.total_errors, "some file contains errors. To skip line use # noqa") - - def test_forgotten_print(self) -> None: - """tests that we did not forget some prints in code, - It's ok use print in scripts, if you want to use print in other places mark them with # noqa - """ - files = self.get_files() - print_regexp = re.compile(r"\s*print\(") - prints_cnt = 0 - for file_path in files: - with open(file_path) as file: - for line_id, line in enumerate(file): - if print_regexp.match(line) and "scripts/" not in file_path: - if "# noqa" not in line: - warnings.warn("seems you forgot print in \n{}:{}".format(file_path, line_id)) - prints_cnt += 1 - self.assertEqual(0, prints_cnt) - - def test_flake(self) -> None: - """Test that we conform to flake.""" - style_guide = flake8.get_style_guide(ignore=["E501", "W504", "ANN101", "TYP101"]) - files = self.get_files() - errors = style_guide.check_files(files) - self.assertEqual(0, errors.total_errors) - - def test_forgotten_removes(self) -> None: - broken_files = 0 - project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")) - for path in self.get_files(): - if "test_style.py" in path: - continue - with open(path) as file: - for line_id, line in enumerate(file): - if "#TODOREMOVE" in line.upper().replace(" ", "") and "# noqa" in line.lower(): # noqa - print("{}:{}".format(path[len(project_root):], line_id + 1)) # noqa - broken_files += 1 - self.assertEqual(0, broken_files) - - def test_imports(self) -> None: - too_many_blocks = [] - import_not_from_root = [] - for file in self.get_files(): - self.__check_imports(file, import_not_from_root, too_many_blocks) - if len(too_many_blocks) > 0: - res = "\n\n" - for line in too_many_blocks: - res += line + ":1\n" - warnings.warn(res) - self.assertEqual(0, len(too_many_blocks)) - if len(import_not_from_root) > 0: - res = "\n\n" - for line_id, line in import_not_from_root: - res += line + ":{}\n".format(line_id + 1) - warnings.warn(res) - self.assertEqual(0, len(import_not_from_root)) - - def __check_imports(self, file: str, import_not_from_root: list, too_many_blocks: list) -> None: - blocks = self._get_import_blocks(file) - if len(blocks) > 3: - too_many_blocks.append(file) - prefix_for_local_imports = ("dedoc", "tests", "config") - if len(blocks) == 3: - for line_id, line in blocks[2]: - local_imports_correct = line.split()[1].startswith(prefix_for_local_imports) - if not local_imports_correct: - import_not_from_root.append((line_id, file)) - - def _get_import_blocks(self, file: str) -> List[List[Tuple[int, str]]]: - blocks = [] - block = [] - prev_line = "" - with open(file) as in_file: - for line_id, line in enumerate(in_file): - if prev_line.strip().endswith("\\"): - prev_line = line - continue - elif line.strip().startswith("#"): - continue - elif line.strip() == "": - if len(block) > 0: - blocks.append(block) - block = [] - else: - block.append((line_id, line)) - prev_line = line - return [block for block in blocks if all(line.startswith(("import", "from")) for _, line in block)] 
diff --git a/tests/test_utils.py b/tests/test_utils.py index d86c6872..f611c24f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,10 +1,10 @@ import importlib import os import signal -from typing import Union, List, Optional, Any +from typing import Any, List, Optional, Union -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.line_metadata import LineMetadata from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.tables.location import Location @@ -31,36 +31,28 @@ def get_test_config() -> dict: return config -def create_line_by_coordinates(x_top_left: int, - y_top_left: int, - width: int, - height: int, - page: int) -> LineWithLocation: +def create_line_by_coordinates(x_top_left: int, y_top_left: int, width: int, height: int, page: int) -> LineWithLocation: bbox = BBox(x_top_left=x_top_left, y_top_left=y_top_left, width=width, height=height) location = Location(bbox=bbox, page_number=page) - line = LineWithLocation( - line="Some text", - metadata=LineMetadata(page_id=page, line_id=0), - annotations=[], - location=location) + line = LineWithLocation(line="Some text", metadata=LineMetadata(page_id=page, line_id=0), annotations=[], location=location) return line class TestTimeout: def __init__(self, seconds: int, error_message: Optional[str] = None) -> None: if error_message is None: - error_message = 'tests timed out after {}s.'.format(seconds) + error_message = f"tests timed out after {seconds}s." self.seconds = seconds self.error_message = error_message - def handle_timeout(self, signum: Any, frame: Any) -> None: + def handle_timeout(self, signum: Any, frame: Any) -> None: # noqa raise Exception(self.error_message) def __enter__(self) -> None: signal.signal(signal.SIGALRM, self.handle_timeout) signal.alarm(self.seconds) - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: # noqa signal.alarm(0) diff --git a/tests/unit_tests/abstract_converter_test.py b/tests/unit_tests/abstract_converter_test.py index 30644646..cd8e16fb 100644 --- a/tests/unit_tests/abstract_converter_test.py +++ b/tests/unit_tests/abstract_converter_test.py @@ -27,8 +27,8 @@ def _convert(self, filename: str, extension: str, converter: AbstractConverter) filename_with_extension = filename + extension file = os.path.join(self.path, filename_with_extension) tmp_file = os.path.join(self.tmp_dir.name, filename_with_extension) - self.assertTrue(os.path.isfile(file), "no such file {}".format(file)) + self.assertTrue(os.path.isfile(file), f"no such file {file}") shutil.copy(file, tmp_file) result = converter.do_convert(tmp_dir=self.tmp_dir.name, filename=filename, extension=extension) path = os.path.join(self.tmp_dir.name, result) - self.assertTrue(os.path.isfile(path), "no such file {}".format(path)) + self.assertTrue(os.path.isfile(path), f"no such file {path}") diff --git a/tests/unit_tests/test_classifier_txt_reader.py b/tests/unit_tests/test_classifier_txt_reader.py new file mode 100644 index 00000000..7b82377c --- /dev/null +++ b/tests/unit_tests/test_classifier_txt_reader.py @@ -0,0 +1,31 @@ +import os +from unittest import TestCase + +from dedoc.readers.txt_reader.raw_text_reader import RawTextReader +from tests.test_utils import get_test_config + + +class TestClassifierTxtReader(TestCase): + config = get_test_config() + reader = RawTextReader(config=config) + 
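+    # test data shared by these reader tests lives in tests/data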
path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) + + def test_read_law(self) -> None: + file = os.path.join(self.path, "laws", "коап_москвы_8_7_2015_utf.txt") + uids_set = set() + prefix = "txt_6210f1fb59150aae33a09f49c8724baf" + document = self.reader.read(file, None, {}) + for line in document.lines: + self.assertNotIn(line.uid, uids_set) + uids_set.add(line.uid) + self.assertEqual(prefix, line.uid[:len(prefix)]) + + def test_read_tz(self) -> None: + file = os.path.join(self.path, "tz", "tz.txt") + uids_set = set() + prefix = "txt_0e576a9e0008225ac27f961af60c0bee" + document = self.reader.read(file, None, {}) + for line in document.lines: + self.assertNotIn(line.uid, uids_set) + uids_set.add(line.uid) + self.assertEqual(prefix, line.uid[:len(prefix)]) diff --git a/tests/unit_tests/test_doctype_law_dynamic_classifier.py b/tests/unit_tests/test_doctype_law_dynamic_classifier.py index 46c763b4..2ce8bdc9 100644 --- a/tests/unit_tests/test_doctype_law_dynamic_classifier.py +++ b/tests/unit_tests/test_doctype_law_dynamic_classifier.py @@ -1,10 +1,10 @@ import os import unittest +from dedoc.readers.txt_reader.raw_text_reader import RawTextReader from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor -from dedoc.readers.txt_reader.raw_text_reader import RawTextReader class TestFoivApiDocreader(unittest.TestCase): @@ -21,54 +21,52 @@ def _get_abs_path(self, file_name: str) -> str: def _test_document_type(self, file_name: str, expected_type: str) -> None: config = {} base_reader = RawTextReader(config=config) - unstructured_document = base_reader.read(path=self._get_abs_path(file_name), - document_type=None, - parameters=None) + unstructured_document = base_reader.read(path=self._get_abs_path(file_name), document_type=None, parameters=None) result = self.structure_extractor._predict_extractor(unstructured_document.lines) self.assertEqual(result.document_type, expected_type) def test_law(self) -> None: - file_name = 'закон.txt' - expected_type = 'law' + file_name = "закон.txt" + expected_type = "law" self._test_document_type(file_name, expected_type) def test_instruction(self) -> None: - file_name = 'инструкция.txt' - expected_type = 'foiv_law' + file_name = "инструкция.txt" + expected_type = "foiv_law" self._test_document_type(file_name, expected_type) def test_codex(self) -> None: - file_name = 'кодекс.txt' - expected_type = 'law' + file_name = "кодекс.txt" + expected_type = "law" self._test_document_type(file_name, expected_type) def test_definition(self) -> None: - file_name = 'определение.txt' - expected_type = 'law' + file_name = "определение.txt" + expected_type = "law" self._test_document_type(file_name, expected_type) def test_resolution(self) -> None: - file_name = 'постановление.txt' - expected_type = 'law' + file_name = "постановление.txt" + expected_type = "law" self._test_document_type(file_name, expected_type) def test_order(self) -> None: - file_name = 'приказ.txt' - expected_type = 'foiv_law' + file_name = "приказ.txt" + expected_type = "foiv_law" self._test_document_type(file_name, expected_type) def test_disposal(self) -> None: - file_name = 'распоряжение.txt' - expected_type = 'law' + file_name = "распоряжение.txt" + expected_type = 
"law" self._test_document_type(file_name, expected_type) def test_decree(self) -> None: - file_name = 'указ.txt' - expected_type = 'law' + file_name = "указ.txt" + expected_type = "law" self._test_document_type(file_name, expected_type) def test_fz(self) -> None: - file_name = 'федеральный_закон.txt' - expected_type = 'law' + file_name = "федеральный_закон.txt" + expected_type = "law" self._test_document_type(file_name, expected_type) diff --git a/tests/unit_tests/test_doctype_law_structure_extractor.py b/tests/unit_tests/test_doctype_law_structure_extractor.py index e73ab545..7f15e89a 100644 --- a/tests/unit_tests/test_doctype_law_structure_extractor.py +++ b/tests/unit_tests/test_doctype_law_structure_extractor.py @@ -4,15 +4,13 @@ from collections import defaultdict from typing import List -from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor -from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import \ - BodyLawHierarchyLevelBuilder - -from tests.test_utils import get_test_config, TestTimeout +from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.body_law_hierarchy_level_builder import BodyLawHierarchyLevelBuilder +from tests.test_utils import TestTimeout, get_test_config class TestLawStructureExtractor(unittest.TestCase): @@ -26,11 +24,9 @@ def test_item(self) -> None: self.assertEqual("item", hl.line_type) hl, _ = self.body_builder._line_2level(text="3) пункт первый", label="structure_unit", init_hl_depth=self.depth) self.assertEqual("item", hl.line_type) - hl, _ = self.body_builder._line_2level(text="1000) пункт первый", label="structure_unit", - init_hl_depth=self.depth) + hl, _ = self.body_builder._line_2level(text="1000) пункт первый", label="structure_unit", init_hl_depth=self.depth) self.assertEqual("item", hl.line_type) - hl, _ = self.body_builder._line_2level(text="1.1.3) пункт первый", label="structure_unit", - init_hl_depth=self.depth) + hl, _ = self.body_builder._line_2level(text="1.1.3) пункт первый", label="structure_unit", init_hl_depth=self.depth) self.assertEqual("item", hl.line_type) def test_subitem(self) -> None: @@ -46,11 +42,9 @@ def test_article_part(self) -> None: self.assertEqual("articlePart", hl.line_type) hl, _ = self.body_builder._line_2level(text="2. пункт первый", label="structure_unit", init_hl_depth=self.depth) self.assertEqual("articlePart", hl.line_type) - hl, _ = self.body_builder._line_2level(text="1.2.1. пункт первый", label="structure_unit", - init_hl_depth=self.depth) + hl, _ = self.body_builder._line_2level(text="1.2.1. пункт первый", label="structure_unit", init_hl_depth=self.depth) self.assertEqual("articlePart", hl.line_type) - hl, _ = self.body_builder._line_2level(text="1.2.1.2. пункт первый", label="structure_unit", - init_hl_depth=self.depth) + hl, _ = self.body_builder._line_2level(text="1.2.1.2. 
пункт первый", label="structure_unit", init_hl_depth=self.depth) self.assertEqual("articlePart", hl.line_type) def test_begin_application(self) -> None: @@ -61,20 +55,17 @@ def test_begin_application(self) -> None: self.assertIsNotNone(self.structure_extractor.classifier.regexp_application_begin.match(application_start.lower())) def test_string_number_correctness_with_regexp(self) -> None: - lines = ['03.06.2009 № 17, от 07.10.2009 № 42, от 10.03.2010 № 6, от 14.04.2010 № 11, от', - 'правонарушениях. (В редакции Закона Москвы от 24.06.2015 г. № 39)', - '2. Нарушение административного регламента', - '1.2.2)', '1.2.4.6}', '1.23.005 ', '1.4.5 ', '1.4.5\n', '1.5.6.Закон о ....'] - answers = [False, - False, - True, - True, True, False, True, True, True] + lines = ["03.06.2009 № 17, от 07.10.2009 № 42, от 10.03.2010 № 6, от 14.04.2010 № 11, от", + "правонарушениях. (В редакции Закона Москвы от 24.06.2015 г. № 39)", + "2. Нарушение административного регламента", + "1.2.2)", "1.2.4.6}", "1.23.005 ", "1.4.5 ", "1.4.5\n", "1.5.6.Закон о ...."] + answers = [False, False, True, True, True, False, True, True, True] for num, line in enumerate(lines): self.assertEqual(answers[num], self.structure_extractor.regexps_part.match(line) is not None, line) def test_number_ends(self) -> None: - numbers = ["1.2.2) ", '1.4.5Д', '1.5.6.Н', '1.2.4.6} '] + numbers = ["1.2.2) ", "1.4.5Д", "1.5.6.Н", "1.2.4.6} "] without_ends = ["1.2.2)", "1.4.5", "1.5.6.", "1.2.4.6}"] for num, number in enumerate(numbers): @@ -109,19 +100,16 @@ def test_empty_document(self) -> None: self.assertListEqual([], self.structure_extractor.classifier.predict([])) def test_fix_labels(self) -> None: - labels = ["title", "raw_text", "title", "structure_unit", "title", "cellar", "structure_unit", "cellar", - "application"] - labels_expected = ["title", "title", "title", "structure_unit", "raw_text", "raw_text", "structure_unit", - "cellar", "application"] + labels = ["title", "raw_text", "title", "structure_unit", "title", "cellar", "structure_unit", "cellar", "application"] + labels_expected = ["title", "title", "title", "structure_unit", "raw_text", "raw_text", "structure_unit", "cellar", "application"] self.assertListEqual(labels_expected, self.__fix_labels(labels)) labels = ["title", "structure_unit", "application", "structure_unit"] self.assertListEqual(labels, self.__fix_labels(labels)) - labels = ["structure_unit", "application", "title", "cellar", "title", "application", "structure_unit", - "structure_unit", "structure_unit", "title"] - labels_expected = ["structure_unit", "application", "raw_text", "raw_text", "raw_text", "application", - "structure_unit", "structure_unit", "structure_unit", "raw_text"] + labels = ["structure_unit", "application", "title", "cellar", "title", "application", "structure_unit", "structure_unit", "structure_unit", "title"] + labels_expected = ["structure_unit", "application", "raw_text", "raw_text", "raw_text", "application", "structure_unit", "structure_unit", + "structure_unit", "raw_text"] self.assertListEqual(labels_expected, self.__fix_labels(labels)) classes = ["structure_unit", "cellar", "application", "title", "footer"] diff --git a/tests/unit_tests/test_doctype_law_text_features_regexps.py b/tests/unit_tests/test_doctype_law_text_features_regexps.py index 84c1c6ad..688a6ad6 100644 --- a/tests/unit_tests/test_doctype_law_text_features_regexps.py +++ b/tests/unit_tests/test_doctype_law_text_features_regexps.py @@ -7,19 +7,19 @@ class TestLawTextFeaturesRegexps(unittest.TestCase): features = 
LawTextFeatures() def test_roman_regexp(self) -> None: - self.assertTrue(self.features.roman_regexp.fullmatch(' XI. ')) - self.assertTrue(self.features.roman_regexp.fullmatch(' ') is None) - self.assertTrue(self.features.roman_regexp.fullmatch(' XI.') is None) - self.assertTrue(self.features.roman_regexp.fullmatch('\tIII. ')) + self.assertTrue(self.features.roman_regexp.fullmatch(" XI. ")) + self.assertTrue(self.features.roman_regexp.fullmatch(" ") is None) + self.assertTrue(self.features.roman_regexp.fullmatch(" XI.") is None) + self.assertTrue(self.features.roman_regexp.fullmatch("\tIII. ")) def test_application_beginnings_with_regexp(self) -> None: - self.assertTrue(self.features.regexp_application_begin.fullmatch('приложение')) - self.assertTrue(self.features.regexp_application_begin.fullmatch('Приложение')) - self.assertTrue(self.features.regexp_application_begin.fullmatch('утверждены')) - self.assertTrue(self.features.regexp_application_begin.fullmatch('приложение к приказу')) - self.assertTrue(self.features.regexp_application_begin.fullmatch('приложение к постановлению')) - self.assertTrue(self.features.regexp_application_begin.fullmatch('постановление') is None) - self.assertTrue(self.features.regexp_application_begin.fullmatch('к приказу') is None) + self.assertTrue(self.features.regexp_application_begin.fullmatch("приложение")) + self.assertTrue(self.features.regexp_application_begin.fullmatch("Приложение")) + self.assertTrue(self.features.regexp_application_begin.fullmatch("утверждены")) + self.assertTrue(self.features.regexp_application_begin.fullmatch("приложение к приказу")) + self.assertTrue(self.features.regexp_application_begin.fullmatch("приложение к постановлению")) + self.assertTrue(self.features.regexp_application_begin.fullmatch("постановление") is None) + self.assertTrue(self.features.regexp_application_begin.fullmatch("к приказу") is None) def test_chapter_beginnings(self) -> None: # note: rewrite this test if we change the number of regexps @@ -27,15 +27,16 @@ def test_chapter_beginnings(self) -> None: regexp = LawTextFeatures.named_regexp[0] - lines = ["глава v. международное сотрудничество российской\n", - 'глава vi. ответственность за нарушение\n', - 'глава 17. вступление в силу настоящего федерального закона\n', - 'глава 1. общие положения\n', - 'глава 9. финансирование в области\n', - 'глава 10. заключительные и переходные положения\n', - 'глава 7. государственное регулирование внешнеторговой\n', - 'глава 8. особые виды |\n', - 'глава 2. принципы и условия обработки персональных данных\n' - ] + lines = [ + "глава v. международное сотрудничество российской\n", + "глава vi. ответственность за нарушение\n", + "глава 17. вступление в силу настоящего федерального закона\n", + "глава 1. общие положения\n", + "глава 9. финансирование в области\n", + "глава 10. заключительные и переходные положения\n", + "глава 7. государственное регулирование внешнеторговой\n", + "глава 8. особые виды |\n", + "глава 2. 
принципы и условия обработки персональных данных\n" + ] for line in lines: - self.assertTrue(regexp.match(line), "doesn't match on\n ''{}''".format(line)) + self.assertTrue(regexp.match(line), f"doesn't match on\n ''{line}''") diff --git a/tests/unit_tests/test_doctype_tz_feature_extractor.py b/tests/unit_tests/test_doctype_tz_feature_extractor.py index acebd394..c985f96c 100644 --- a/tests/unit_tests/test_doctype_tz_feature_extractor.py +++ b/tests/unit_tests/test_doctype_tz_feature_extractor.py @@ -3,7 +3,6 @@ from dedoc.readers.docx_reader.docx_reader import DocxReader from dedoc.structure_extractors.feature_extractors.tz_feature_extractor import TzTextFeatures - from tests.test_utils import get_test_config @@ -44,7 +43,7 @@ def test_end_regexp(self) -> None: self.assertEqual(0, sum(self.feature_extractor._end_regexp(line2))) def test_named_item_regexp(self) -> None: - self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch('раздел')) - self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch('подраздел')) - self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch('подраздел \t ')) - self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch('разделывать') is None) + self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch("раздел")) + self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch("подраздел")) + self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch("подраздел \t ")) + self.assertTrue(self.feature_extractor.named_item_regexp.fullmatch("разделывать") is None) diff --git a/tests/unit_tests/test_font_classifier.py b/tests/unit_tests/test_font_classifier.py new file mode 100644 index 00000000..d721f6c4 --- /dev/null +++ b/tests/unit_tests/test_font_classifier.py @@ -0,0 +1,42 @@ +import os +import unittest + +from PIL import Image + +from dedoc.data_structures.bbox import BBox +from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox +from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox +from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.font_type_classifier import FontTypeClassifier +from tests.test_utils import get_test_config + + +class TestFontClassifier(unittest.TestCase): + + data_directory_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "scanned")) + dirname = os.path.dirname(__file__) + path_model = os.path.abspath(os.path.join(get_test_config()["resources_path"], "font_classifier.pth")) + classifier = FontTypeClassifier(path_model) + + def get_page(self) -> PageWithBBox: + image = Image.open(os.path.join(self.data_directory_path, "orient_1.png")) + + bbox_1 = TextWithBBox(bbox=BBox(10, 20, 11, 23), page_num=0, text="str", line_num=0) + bbox_2 = TextWithBBox(bbox=BBox(20, 30, 11, 23), page_num=0, text="rts", line_num=1) + bboxes = [bbox_1, bbox_2] + + return PageWithBBox(image=image, bboxes=bboxes, page_num=0) + + def test__page2tensor(self) -> None: + page = self.get_page() + tensor = FontTypeClassifier._page2tensor(page=page) + bbox_num, channels, height, width = tensor.shape + self.assertEqual(2, bbox_num) + self.assertEqual(3, channels) + self.assertEqual(15, height) + self.assertEqual(300, width) + + def test__get_model_predictions(self) -> None: + page = self.get_page() + predictions = self.classifier._get_model_predictions(page) + self.assertEqual(predictions.shape[0], 2) + self.assertEqual(len(predictions.shape), 2) diff --git a/tests/unit_tests/test_format_docx_reader.py 
b/tests/unit_tests/test_format_docx_reader.py index f0ad4cfd..3694d7da 100644 --- a/tests/unit_tests/test_format_docx_reader.py +++ b/tests/unit_tests/test_format_docx_reader.py @@ -4,8 +4,8 @@ from tempfile import TemporaryDirectory from dedoc.config import get_config -from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor from dedoc.readers.docx_reader.docx_reader import DocxReader @@ -106,15 +106,15 @@ def test_caps_letters1(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("caps_1.docx") result = docx_reader.read(path) - self.assertEqual('ШИЖМАШ МОГАЙ ЛИЕШ ГЫН? ', result.lines[2].line) - self.assertEqual('АНАСТАСИЯ АЙГУЗИНА', result.lines[3].line) + self.assertEqual("ШИЖМАШ МОГАЙ ЛИЕШ ГЫН? ", result.lines[2].line) + self.assertEqual("АНАСТАСИЯ АЙГУЗИНА", result.lines[3].line) def test_caps_letters2(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("caps_2.docx") result = docx_reader.read(path) self.assertEqual('И. Одар "Таргылтыш"\n', result.lines[0].line) - self.assertEqual('I глава\n', result.lines[2].line) + self.assertEqual("I глава\n", result.lines[2].line) def test_justification(self) -> None: docx_reader = DocxReader(config=get_config()) @@ -222,7 +222,8 @@ def test_tables_with_merged_cells(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("big_table_with_merged_cells.docx") result = docx_reader.read(path) - hidden_cells_big_table = [(0, 1), (0, 2), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (3, 1), (3, 2), (3, 3), (4, 0), (4, 1), (4, 2), (4, 3), (5, 0), (5, 1), (5, 2), (5, 3), (5, 6), (5, 7), (5, 8), (5, 9)] + hidden_cells_big_table = [(0, 1), (0, 2), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (3, 1), (3, 2), (3, 3), + (4, 0), (4, 1), (4, 2), (4, 3), (5, 0), (5, 1), (5, 2), (5, 3), (5, 6), (5, 7), (5, 8), (5, 9)] for i, j in hidden_cells_big_table: self.assertTrue(result.tables[0].metadata.cell_properties[i][j].invisible) self.assertEqual(result.tables[0].metadata.cell_properties[i][j].rowspan, 1) diff --git a/tests/unit_tests/test_format_image_reader_bbox.py b/tests/unit_tests/test_format_image_reader_bbox.py index 882b6b6c..497d11fc 100644 --- a/tests/unit_tests/test_format_image_reader_bbox.py +++ b/tests/unit_tests/test_format_image_reader_bbox.py @@ -1,7 +1,8 @@ +import os import re import unittest + import cv2 -import os from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor from tests.test_utils import get_test_config @@ -27,27 +28,20 @@ def test_line_order(self) -> None: self.assertEqual("экспортному контролю подключения значимого объекта критической", bboxes[6].text.strip()) self.assertEqual("информационной инфраструктуры Российской Федерации", bboxes[7].text.strip()) self.assertEqual("к сети связи общего пользования", bboxes[8].text.strip()) - self.assertEqual("критической информационной инфраструктуры Российской Федерации (далее", - bboxes[10].text.strip()) + self.assertEqual("критической информационной инфраструктуры Российской Федерации (далее", bboxes[10].text.strip()) self.assertEqual("субъект критической информационной инфраструктуры) с ФСТЭК России", bboxes[11].text.strip()) - self.assertEqual("подключения значимого объекта критической 
информационной инфраструктуры", - bboxes[12].text.strip()) - self.assertEqual("пользования, осуществляемого в соответствии с пунктом 3 Правил подготовки и", - bboxes[14].text.strip()) - self.assertEqual("использования ресурсов единой сети электросвязи Российской Федерации для", - bboxes[15].text.strip()) + self.assertEqual("подключения значимого объекта критической информационной инфраструктуры", bboxes[12].text.strip()) + self.assertEqual("пользования, осуществляемого в соответствии с пунктом 3 Правил подготовки и", bboxes[14].text.strip()) + self.assertEqual("использования ресурсов единой сети электросвязи Российской Федерации для", bboxes[15].text.strip()) self.assertEqual("обеспечения функционирования значимых объектов критической", bboxes[16].text.strip()) self.assertEqual("информационной инфраструктуры, утвержденных постановлением", bboxes[17].text.strip()) self.assertEqual("Правительства Российской Федерации от 8 июня 2019 г №743 (Собрание", bboxes[18].text.strip()) - self.assertEqual("законодательства Российской Федерации, 2019, № 24, ст. 3099) (далее", - bboxes[19].text.strip()) + self.assertEqual("законодательства Российской Федерации, 2019, № 24, ст. 3099) (далее", bboxes[19].text.strip()) self.assertEqual("Правила).", bboxes[20].text.strip()) self.assertEqual("2. Согласование подключения создаваемого значимого объекта", bboxes[21].text.strip()) - self.assertEqual("осуществляется до ввода его в действие на этапе, определяемом субъектом", - bboxes[22].text.strip()) + self.assertEqual("осуществляется до ввода его в действие на этапе, определяемом субъектом", bboxes[22].text.strip()) self.assertEqual("критической информационной инфраструктуры. Согласование подключения", bboxes[23].text.strip()) - self.assertEqual("действующего значимого объекта осуществляется до заключения договора с", - bboxes[24].text.strip()) + self.assertEqual("действующего значимого объекта осуществляется до заключения договора с", bboxes[24].text.strip()) self.assertEqual("оператором связи, предусмотренного пунктом 6 Правил.", bboxes[25].text.strip()) self.assertEqual("3.В случае если значимый объект на момент его включения в реестр", bboxes[26].text.strip()) self.assertEqual("значимых критической информационной инфраструктуры", bboxes[27].text.strip()) @@ -55,14 +49,10 @@ def test_line_order(self) -> None: self.assertEqual("информационной инфраструктуры) подключен к сети связи общего", bboxes[29].text.strip()) self.assertEqual("пользования, согласование ФСТЭК России в соответствии с настоящим", bboxes[30].text.strip()) self.assertEqual("Порядком не требуется.", bboxes[31].text.strip()) - self.assertEqual("4. Для согласования подключения значимого объекта к сети связи общего", - bboxes[32].text.strip()) + self.assertEqual("4. 
Для согласования подключения значимого объекта к сети связи общего", bboxes[32].text.strip()) self.assertEqual("пользования субъект критической информационной инфраструктуры", bboxes[33].text.strip()) self.assertEqual("представляет посредством почтового отправления или непосредственно", bboxes[34].text.strip()) self.assertEqual("в ФСТЭК России следующие сведения:", bboxes[35].text.strip()) - self.assertEqual("Порядок ведения реестра значимых объектов критической информационной инфраструкту-", - bboxes[36].text.strip()) - self.assertEqual("№227 «Об утверждении Порядка ведения реестра значимых объектов критической информа-", - bboxes[38].text.strip()) - self.assertEqual("ционной нифраструктуры Российской Федерации» (зарегистрирован Минюстом России", - bboxes[39].text.strip()) + self.assertEqual("Порядок ведения реестра значимых объектов критической информационной инфраструкту-", bboxes[36].text.strip()) + self.assertEqual("№227 «Об утверждении Порядка ведения реестра значимых объектов критической информа-", bboxes[38].text.strip()) + self.assertEqual("ционной нифраструктуры Российской Федерации» (зарегистрирован Минюстом России", bboxes[39].text.strip()) diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py index 4bf2cb4a..e928ed7e 100644 --- a/tests/unit_tests/test_format_pdf_reader.py +++ b/tests/unit_tests/test_format_pdf_reader.py @@ -1,10 +1,11 @@ import os +import re import shutil import unittest from tempfile import TemporaryDirectory from typing import List + import cv2 -import re from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier @@ -27,7 +28,7 @@ def _split_lines_on_pages(self, lines: List[LineWithMeta]) -> List[List[str]]: def test_scan_rotator(self) -> None: scan_rotator = ScanRotator(config=get_test_config()) - imgs_path = [f'../data/scan_rotator/rotated_{i}.jpg' for i in range(1, 5)] + imgs_path = [f"../data/scan_rotator/rotated_{i}.jpg" for i in range(1, 5)] angles = [0.061732858955328755, -0.017535263190370427, 0.12228411148417097, 0] for i in range(len(imgs_path)): @@ -40,7 +41,7 @@ def test_scan_orientation(self) -> None: scan_rotator = ScanRotator(config=get_test_config()) - imgs_path = [f'../data/scanned/orient_{i}.png'for i in range(1, 5)] + imgs_path = [f"../data/scanned/orient_{i}.png" for i in range(1, 5)] angles = [90.0, 90.0, 270.0, 270.0] max_delta = 10.0 for i in range(len(imgs_path)): @@ -63,7 +64,7 @@ def test_header_footer_search(self) -> None: lines_by_page = self._split_lines_on_pages(result.lines) - headers = [lines[0] for lines in lines_by_page if lines[0] == 'Richelieu Bond \n'] + headers = [lines[0] for lines in lines_by_page if lines[0] == "Richelieu Bond \n"] footers = [lines[-1] for lines in lines_by_page if re.match(r"^\s*-( )*[0-9]+( )*-\s*$", lines[-1])] self.assertEqual(len(headers), 0) @@ -82,10 +83,8 @@ def test_header_footer_search_2(self) -> None: lines_by_page = self._split_lines_on_pages(result.lines) - headers = [lines[0] for lines in lines_by_page - if lines[0] == 'Richelieu Bond \n'] - footers = [lines[-1] for lines in lines_by_page - if re.match(r"^\s*-( )*[0-9]+( )*-\s*$", lines[-1])] + headers = [lines[0] for lines in lines_by_page if lines[0] == "Richelieu Bond \n"] + footers = [lines[-1] for lines in lines_by_page if re.match(r"^\s*-( )*[0-9]+( )*-\s*$", lines[-1])] self.assertEqual(len(headers), 0) 
self.assertEqual(len(footers), 0) @@ -103,10 +102,8 @@ def test_header_footer_search_3(self) -> None: lines_by_page = self._split_lines_on_pages(result.lines) - headers = [lines[0] for lines in lines_by_page - if lines[0] == 'QUEST MANAGEMENT, SICAV\n'] - footers = [lines[-1] for lines in lines_by_page - if re.match(r"^\s*[0-9]\s*$", lines[-1])] + headers = [lines[0] for lines in lines_by_page if lines[0] == "QUEST MANAGEMENT, SICAV\n"] + footers = [lines[-1] for lines in lines_by_page if re.match(r"^\s*[0-9]\s*$", lines[-1])] self.assertEqual(len(headers), 1) self.assertEqual(len(footers), 0) @@ -125,8 +122,7 @@ def test_long_list_in_pdf(self) -> None: self.assertEqual(list_elements[5].line.lower().strip(), "6. июнь") self.assertEqual(list_elements[6].line.lower().strip(), "7. июль") self.assertEqual(list_elements[7].line.lower().strip(), "8. август") - self.assertEqual(list_elements[8].line.lower().strip(), - "9. сентябрь в сентябре, в сентябре много листьев на земле желтые и красные! все такие") + self.assertEqual(list_elements[8].line.lower().strip(), "9. сентябрь в сентябре, в сентябре много листьев на земле желтые и красные! все такие") self.assertEqual(list_elements[9].line.lower().strip(), "разные!") self.assertEqual(list_elements[10].line.lower().strip(), "10. октябрь") self.assertEqual(list_elements[11].line.lower().strip(), "11. ноябрь") diff --git a/tests/unit_tests/test_format_txt_reader.py b/tests/unit_tests/test_format_txt_reader.py index 19f971ea..e7249651 100644 --- a/tests/unit_tests/test_format_txt_reader.py +++ b/tests/unit_tests/test_format_txt_reader.py @@ -1,8 +1,8 @@ import os from unittest import TestCase -from dedoc.readers.txt_reader.raw_text_reader import RawTextReader from dedoc.config import get_config +from dedoc.readers.txt_reader.raw_text_reader import RawTextReader from tests.test_utils import get_test_config @@ -35,5 +35,5 @@ def test_get_lines_with_meta(self) -> None: file = os.path.join(self.path, "txt", "pr_17.txt") reader = RawTextReader(config=get_config()) for line in reader._get_lines_with_meta(path=file, encoding="utf-8"): - expected_uid = "txt_1a3cd561910506d56a65db1d1dcb5049_{}".format(line.metadata.line_id) + expected_uid = f"txt_1a3cd561910506d56a65db1d1dcb5049_{line.metadata.line_id}" self.assertEqual(expected_uid, line.uid) diff --git a/tests/unit_tests/test_law_dynamic_classifier.py b/tests/unit_tests/test_law_dynamic_classifier.py new file mode 100644 index 00000000..49eb9057 --- /dev/null +++ b/tests/unit_tests/test_law_dynamic_classifier.py @@ -0,0 +1,75 @@ +import os +import unittest + +from dedoc.readers.txt_reader.raw_text_reader import RawTextReader +from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor +from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor +from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor + + +class TestFoivApiDocreader(unittest.TestCase): + data_path = os.path.join(os.path.dirname(__file__), "..", "data", "laws") + data_path = os.path.abspath(data_path) + law_extractors = { + FoivLawStructureExtractor.document_type: FoivLawStructureExtractor(config={}), + LawStructureExtractor.document_type: LawStructureExtractor(config={}) + } + structure_extractor = ClassifyingLawStructureExtractor(extractors=law_extractors, config={}) + + def _get_abs_path(self, file_name: str) -> str: + return os.path.join(self.data_path, "doctypes", file_name) + + def _test_classifier_type(self, file_name: str, expected_type: str) -> None: + config = {} + base_reader = RawTextReader(config=config) + unstructured_document = base_reader.read(path=self._get_abs_path(file_name), + document_type=None, + parameters=None) + result = self.structure_extractor._predict_extractor(unstructured_document.lines) + + self.assertEqual(result.document_type, expected_type) + + def test_law(self) -> None: + file_name = "закон.txt" + expected_type = "law" + self._test_classifier_type(file_name, expected_type) + + def test_instruction(self) -> None: + file_name = "инструкция.txt" + expected_type = "foiv_law" + self._test_classifier_type(file_name, expected_type) + + def test_codex(self) -> None: + file_name = "кодекс.txt" + expected_type = "law" + self._test_classifier_type(file_name, expected_type) + + def test_definition(self) -> None: + file_name = "определение.txt" + expected_type = "law" + self._test_classifier_type(file_name, expected_type) + + def test_resolution(self) -> None: + file_name = "постановление.txt" + expected_type = "law" + self._test_classifier_type(file_name, expected_type) + + def test_order(self) -> None: + file_name = "приказ.txt" + expected_type = "foiv_law" + self._test_classifier_type(file_name, expected_type) + + def test_disposal(self) -> None: + file_name = "распоряжение.txt" + expected_type = "law" + self._test_classifier_type(file_name, expected_type) + + def test_decree(self) -> None: + file_name = "указ.txt" + expected_type = "law" + self._test_classifier_type(file_name, expected_type) + + def test_fz(self) -> None: + file_name = "федеральный_закон.txt" + expected_type = "law" + self._test_classifier_type(file_name, expected_type) diff --git a/tests/unit_tests/test_line_extractor.py b/tests/unit_tests/test_line_extractor.py new file mode 100644 index 00000000..84f440a7 --- /dev/null +++ b/tests/unit_tests/test_line_extractor.py @@ -0,0 +1,28 @@ +import json +import os +import unittest + +from tests.test_utils import get_test_config +from dedoc.train_dataset.extractors.line_with_meta_extractor import LineWithMetaExtractor + + +class TestLineWithMetaExtractor(unittest.TestCase): + + def test_txt_file(self) -> None: + config = get_test_config() + documents_path = os.path.join(os.path.dirname(__file__), "..", "data", "laws") + documents_path = os.path.abspath(documents_path) + self.assertTrue(os.path.isdir(documents_path)) + path = os.path.join(documents_path, "law_classifier_000000_Bhw.json") + self.assertTrue(os.path.isfile(path)) + extractor = LineWithMetaExtractor(path=path, documents_path=documents_path, config=config) + lines = extractor.create_task() + with open(path) as file: + labels = json.load(file) + labels = {key: value for key, value in labels.items() if not value["data"]["_uid"].endswith("_split")} + + uids_set_real = {line.uid for line in lines} + uids2label = {item["data"]["_uid"]: item["labeled"][0] for item in labels.values()} + self.assertSetEqual(set(uids2label.keys()), uids_set_real) + for line in lines: + self.assertEqual(line.label, uids2label[line.uid]) diff --git a/tests/unit_tests/test_misc_annotations.py b/tests/unit_tests/test_misc_annotations.py index 5f1af8a7..3d91d639 100644 --- a/tests/unit_tests/test_misc_annotations.py +++ b/tests/unit_tests/test_misc_annotations.py @@ -2,8 +2,8 @@ from typing import List, Set, Tuple from dedoc.data_structures.annotation import Annotation -from dedoc.utils.annotation_merger import AnnotationMerger from 
dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor +from dedoc.utils.annotation_merger import AnnotationMerger from tests.test_utils import TestTimeout diff --git a/tests/unit_tests/test_misc_dedoc_manager.py b/tests/unit_tests/test_misc_dedoc_manager.py index d63c8fe2..93755689 100644 --- a/tests/unit_tests/test_misc_dedoc_manager.py +++ b/tests/unit_tests/test_misc_dedoc_manager.py @@ -2,8 +2,8 @@ from unittest import TestCase from dedoc.config import get_config -from dedoc.manager_config import get_manager_config from dedoc.dedoc_manager import DedocManager +from dedoc.manager_config import get_manager_config class TestDedocManager(TestCase): diff --git a/tests/unit_tests/test_misc_feature_extractor.py b/tests/unit_tests/test_misc_feature_extractor.py index 7a954c82..f4d3fd87 100644 --- a/tests/unit_tests/test_misc_feature_extractor.py +++ b/tests/unit_tests/test_misc_feature_extractor.py @@ -1,5 +1,6 @@ import unittest from typing import List + import numpy as np from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor diff --git a/tests/unit_tests/test_misc_line_extractor.py b/tests/unit_tests/test_misc_line_extractor.py index 84f440a7..eec1cfeb 100644 --- a/tests/unit_tests/test_misc_line_extractor.py +++ b/tests/unit_tests/test_misc_line_extractor.py @@ -2,8 +2,8 @@ import os import unittest -from tests.test_utils import get_test_config from dedoc.train_dataset.extractors.line_with_meta_extractor import LineWithMetaExtractor +from tests.test_utils import get_test_config class TestLineWithMetaExtractor(unittest.TestCase): diff --git a/tests/unit_tests/test_misc_line_object_linker.py b/tests/unit_tests/test_misc_line_object_linker.py index 29da4bf9..f21a939a 100644 --- a/tests/unit_tests/test_misc_line_object_linker.py +++ b/tests/unit_tests/test_misc_line_object_linker.py @@ -5,7 +5,7 @@ from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker -from tests.test_utils import get_test_config, create_line_by_coordinates +from tests.test_utils import create_line_by_coordinates, get_test_config class TestLineObjectLinker(unittest.TestCase): diff --git a/tests/unit_tests/test_misc_line_split.py b/tests/unit_tests/test_misc_line_split.py index 725afc32..885a1a63 100644 --- a/tests/unit_tests/test_misc_line_split.py +++ b/tests/unit_tests/test_misc_line_split.py @@ -4,9 +4,9 @@ from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation -from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta class TestLineSplit(unittest.TestCase): diff --git a/tests/unit_tests/test_misc_line_sum.py b/tests/unit_tests/test_misc_line_sum.py index 0bc1d767..49c26c9e 100644 --- a/tests/unit_tests/test_misc_line_sum.py +++ b/tests/unit_tests/test_misc_line_sum.py @@ -5,8 +5,8 @@ from 
dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta def _make_line(line: str, annotations: List[Annotation]) -> LineWithMeta: @@ -22,7 +22,7 @@ class TestLineSum(unittest.TestCase): sized_line = _make_line("SmallBig", [SizeAnnotation(0, 5, "8"), SizeAnnotation(5, 8, "14")]) lines = [empty, italic_line, sized_line, bold_line] - def assertAnnotationsEqual(self, expected: List[Annotation], result: List[Annotation]) -> None: + def assert_annotations_equal(self, expected: List[Annotation], result: List[Annotation]) -> None: self.assertEqual(len(expected), len(result)) for annotation in result: self.assertIn(annotation, expected) @@ -35,14 +35,14 @@ def test_empty_nonempty(self) -> None: for non_empty in self.lines: for result in (self.empty + non_empty, non_empty + self.empty): self.assertEqual(non_empty.line, result.line) - self.assertAnnotationsEqual(non_empty.annotations, result.annotations) + self.assert_annotations_equal(non_empty.annotations, result.annotations) def test_sum_with_str(self) -> None: text = "some text" for line in self.lines: result = line + text self.assertEqual(line.line + text, result.line) - self.assertAnnotationsEqual(line.annotations, result.annotations) + self.assert_annotations_equal(line.annotations, result.annotations) def test_line_plus_line(self) -> None: for first in self.lines: @@ -55,4 +55,4 @@ def test_line_plus_line(self) -> None: result = self.bold_line + self.italic_line expected = [BoldAnnotation(0, len(self.bold_line.line), "True"), ItalicAnnotation(4, 10, "True")] - self.assertAnnotationsEqual(expected, result.annotations) + self.assert_annotations_equal(expected, result.annotations) diff --git a/tests/unit_tests/test_misc_list_feature_extractor.py b/tests/unit_tests/test_misc_list_feature_extractor.py index bf7b3c5d..6d3f5a5a 100644 --- a/tests/unit_tests/test_misc_list_feature_extractor.py +++ b/tests/unit_tests/test_misc_list_feature_extractor.py @@ -1,15 +1,16 @@ from unittest import TestCase import numpy as np + from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.feature_extractors.list_features.list_features_extractor import ListFeaturesExtractor from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.empty_prefix import EmptyPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix -from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix class TestListFeatures(TestCase): @@ -103,7 +104,7 @@ def test_empty_prefix(self) -> None: 
self.assertEqual(EmptyPrefix(indent=10), self.feature_extractor._get_prefix(line)) def test_get_window(self) -> None: - prefixes = [BracketPrefix("{})".format(i), 1.01 * i) for i in range(0, 300)] + prefixes = [BracketPrefix(f"{i})", 1.01 * i) for i in range(0, 300)] doc_size = len(prefixes) assert doc_size == 300 indents = np.array([prefix.indent for prefix in prefixes]) diff --git a/tests/unit_tests/test_misc_list_patcher.py b/tests/unit_tests/test_misc_list_patcher.py deleted file mode 100644 index 35e04cc8..00000000 --- a/tests/unit_tests/test_misc_list_patcher.py +++ /dev/null @@ -1,61 +0,0 @@ -import unittest -from typing import List, Optional - -from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.line_metadata import LineMetadata -from dedoc.structure_constructors.concrete_structure_constructors.list_patcher import ListPatcher -from dedoc.data_structures.hierarchy_level import HierarchyLevel - - -class TestListPatcher(unittest.TestCase): - patcher = ListPatcher() - - def __get_text(self, lines: List[LineWithMeta]) -> List[str]: - return [line.line for line in lines] - - def __get_line(self, text: str, level1: Optional[int], level2: Optional[int], hl: str = "list") -> LineWithMeta: - hierarchy_level = HierarchyLevel(level1, level2, False, hl) - metadata = LineMetadata(line_id=None, page_id=0, hierarchy_level=hierarchy_level) - return LineWithMeta(text, metadata=metadata, annotations=[]) - - def test_correct_list(self) -> None: - line1 = self.__get_line("1 item", 1, 0) - line2 = self.__get_line("2 item", 1, 0) - line3 = self.__get_line("2.1 item", 1, 1) - line4 = self.__get_line("2.2 item", 1, 0) - line5 = self.__get_line("3 item", 1, 0) - lines = [line1, line2, line3, line4, line5] - result = self.patcher.patch(lines) - self.assertListEqual(self.__get_text(lines), self.__get_text(result)) - - def test_hierarchy_level_raw_text(self) -> None: - line1 = self.__get_line("2 item", None, None, HierarchyLevel.raw_text) - line2 = self.__get_line("some item", None, None, HierarchyLevel.raw_text) - line3 = self.__get_line("2 item", None, None, HierarchyLevel.raw_text) - - lines = [line1, line2, line3] - result = self.patcher.patch(lines) - expected = ["2 item", "some item", "2 item"] - self.assertListEqual(expected, self.__get_text(result)) - - def test_empty_list(self) -> None: - lines = [] - result = self.patcher.patch(lines) - self.assertListEqual([], self.__get_text(result)) - - def test_miss_head_element_list1(self) -> None: - line2 = self.__get_line("2 item", 1, 0) - line3 = self.__get_line("2.1 item", 1, 1) - line4 = self.__get_line("2.2 item", 1, 0) - line5 = self.__get_line("3 item", 1, 0) - lines = [line2, line3, line4, line5] - result = self.patcher.patch(lines) - self.assertListEqual(self.__get_text(lines), self.__get_text(result)) - - def test_miss_head_element_list2(self) -> None: - line3 = self.__get_line("2.1 item", 1, 1) - line4 = self.__get_line("2.2 item", 1, 0) - line5 = self.__get_line("3 item", 1, 0) - lines = [line3, line4, line5] - result = self.patcher.patch(lines) - self.assertListEqual(self.__get_text(lines), self.__get_text(result)) diff --git a/tests/unit_tests/test_misc_prefix.py b/tests/unit_tests/test_misc_prefix.py index 29f641bd..ee77673c 100644 --- a/tests/unit_tests/test_misc_prefix.py +++ b/tests/unit_tests/test_misc_prefix.py @@ -2,10 +2,10 @@ from typing import List, Type from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix +from 
dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.empty_prefix import EmptyPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix -from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix @@ -140,8 +140,8 @@ def test_letter_all_predecessor(self) -> None: for first, second in zip(letters[:-1], letters[1:]): first = LetterPrefix(first, 0) second = LetterPrefix(second, 0) - self.assertTrue(second.predecessor(first), "{} should be predecessor of {}".format(first, second)) - self.assertFalse(first.predecessor(second), "{} should not be predecessor of {}".format(first, second)) + self.assertTrue(second.predecessor(first), f"{first} should be predecessor of {second}") + self.assertFalse(first.predecessor(second), f"{first} should not be predecessor of {second}") def _check_three_prefix(self, one: LinePrefix, three: LinePrefix, two: LinePrefix) -> None: self.assertTrue(two.predecessor(one)) @@ -149,7 +149,7 @@ def _check_three_prefix(self, one: LinePrefix, three: LinePrefix, two: LinePrefi self.assertFalse(one.predecessor(one)) self.assertFalse(one.predecessor(two)) self.assertFalse(one.predecessor(three)) - self.assertFalse(three.predecessor(one), "{} {}".format(three, one)) + self.assertFalse(three.predecessor(one), f"{three} {one}") def test_dotted_predecessor_one_num(self) -> None: one = DottedPrefix("1.", 0) @@ -205,7 +205,7 @@ def test_dotted_predecessor_different_num(self) -> None: self.assertFalse(one.predecessor(two)) def test_dotted_list_regexp(self) -> None: - self.assertTrue(BulletPrefix.regexp.fullmatch(' -')) - self.assertTrue(BulletPrefix.regexp.fullmatch('*')) - self.assertTrue(BulletPrefix.regexp.fullmatch(' ©')) - self.assertTrue(BulletPrefix.regexp.fullmatch(' © ') is None) + self.assertTrue(BulletPrefix.regexp.fullmatch(" -")) + self.assertTrue(BulletPrefix.regexp.fullmatch("*")) + self.assertTrue(BulletPrefix.regexp.fullmatch(" ©")) + self.assertTrue(BulletPrefix.regexp.fullmatch(" © ") is None) diff --git a/tests/unit_tests/test_misc_regexps.py b/tests/unit_tests/test_misc_regexps.py index f7e2b9f4..d6b0418d 100644 --- a/tests/unit_tests/test_misc_regexps.py +++ b/tests/unit_tests/test_misc_regexps.py @@ -1,74 +1,74 @@ import unittest -from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_item, regexps_item_with_bracket, regexps_subitem_with_dots, \ - regexps_subitem_extended, regexps_subitem, regexps_number, regexps_year, regexps_ends_of_number +from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_ends_of_number, regexps_item, regexps_item_with_bracket, \ + regexps_number, regexps_subitem, regexps_subitem_extended, regexps_subitem_with_dots, regexps_year class TestUtilsRegexps(unittest.TestCase): def test_item_regexp(self) -> None: - self.assertTrue(regexps_item.match(' 1. some test text')) - self.assertIsNone(regexps_item.match(' 1.1 some test text')) - self.assertTrue(regexps_item.match('\t1.qwe') is None) - self.assertTrue(regexps_item.match('\t5. 
qwe')) - self.assertTrue(regexps_item.match('1) somw text') is None) - self.assertTrue(regexps_item.match('edber 1) somw text') is None) + self.assertTrue(regexps_item.match(" 1. some test text")) + self.assertIsNone(regexps_item.match(" 1.1 some test text")) + self.assertTrue(regexps_item.match("\t1.qwe") is None) + self.assertTrue(regexps_item.match("\t5. qwe")) + self.assertTrue(regexps_item.match("1) somw text") is None) + self.assertTrue(regexps_item.match("edber 1) somw text") is None) def test_item_with_bracket_regexp(self) -> None: - self.assertTrue(regexps_item_with_bracket.match(' 1. some test text') is None) - self.assertTrue(regexps_item_with_bracket.match(' 1) some test text')) - self.assertTrue(regexps_item_with_bracket.match(' \t 1} some test text')) - self.assertTrue(regexps_item_with_bracket.match(' \t 4.2.3.4) some test text')) - self.assertTrue(regexps_item_with_bracket.match(' \t 4.234) some test text')) - self.assertTrue(regexps_item_with_bracket.match(' dkjfbe 1. some test text') is None) - self.assertTrue(regexps_item_with_bracket.match('123|') is None) + self.assertTrue(regexps_item_with_bracket.match(" 1. some test text") is None) + self.assertTrue(regexps_item_with_bracket.match(" 1) some test text")) + self.assertTrue(regexps_item_with_bracket.match(" \t 1} some test text")) + self.assertTrue(regexps_item_with_bracket.match(" \t 4.2.3.4) some test text")) + self.assertTrue(regexps_item_with_bracket.match(" \t 4.234) some test text")) + self.assertTrue(regexps_item_with_bracket.match(" dkjfbe 1. some test text") is None) + self.assertTrue(regexps_item_with_bracket.match("123|") is None) def test_subitem_with_dots_regexp(self) -> None: - self.assertTrue(regexps_subitem_with_dots.match('а) текст на русском')) - self.assertTrue(regexps_subitem_with_dots.match(' 123.я. ')) - self.assertTrue(regexps_subitem_with_dots.match(' 123.т.е.с.т.д.л.и.н.н.о.г.о.с.п.и.с.к.а. ')) - self.assertTrue(regexps_subitem_with_dots.match(' 123.456') is None) - self.assertTrue(regexps_subitem_with_dots.match(' 123.ч.и.с.123.л.а. ')) - self.assertTrue(regexps_subitem_with_dots.match('12.б.у.к.в.ы. ')) - self.assertTrue(regexps_subitem_with_dots.match('23.б.у.к.в.ы.') is None) - self.assertTrue(regexps_subitem_with_dots.match(' 123.ч.и.с.123.ла. ') is None) - self.assertTrue(regexps_subitem.match('б)')) - self.assertTrue(regexps_subitem.match('b)') is None) + self.assertTrue(regexps_subitem_with_dots.match("а) текст на русском")) + self.assertTrue(regexps_subitem_with_dots.match(" 123.я. ")) + self.assertTrue(regexps_subitem_with_dots.match(" 123.т.е.с.т.д.л.и.н.н.о.г.о.с.п.и.с.к.а. ")) + self.assertTrue(regexps_subitem_with_dots.match(" 123.456") is None) + self.assertTrue(regexps_subitem_with_dots.match(" 123.ч.и.с.123.л.а. ")) + self.assertTrue(regexps_subitem_with_dots.match("12.б.у.к.в.ы. ")) + self.assertTrue(regexps_subitem_with_dots.match("23.б.у.к.в.ы.") is None) + self.assertTrue(regexps_subitem_with_dots.match(" 123.ч.и.с.123.ла. 
") is None) + self.assertTrue(regexps_subitem.match("б)")) + self.assertTrue(regexps_subitem.match("b)") is None) def test_subitem_extended_regexp(self) -> None: - self.assertTrue(regexps_subitem_extended.fullmatch('z)')) - self.assertTrue(regexps_subitem_extended.fullmatch('я}')) - self.assertTrue(regexps_subitem_extended.fullmatch('Q|') is None) + self.assertTrue(regexps_subitem_extended.fullmatch("z)")) + self.assertTrue(regexps_subitem_extended.fullmatch("я}")) + self.assertTrue(regexps_subitem_extended.fullmatch("Q|") is None) def test_subitem_regexp(self) -> None: - self.assertTrue(regexps_subitem.match('а) текст на русском')) - self.assertTrue(regexps_subitem.match(' ё) english text')) - self.assertTrue(regexps_subitem.match('start ё) english text') is None) - self.assertTrue(regexps_subitem.match('b)') is None) - self.assertTrue(regexps_subitem.match('б)')) - self.assertTrue(regexps_subitem.match('б|') is None) + self.assertTrue(regexps_subitem.match("а) текст на русском")) + self.assertTrue(regexps_subitem.match(" ё) english text")) + self.assertTrue(regexps_subitem.match("start ё) english text") is None) + self.assertTrue(regexps_subitem.match("b)") is None) + self.assertTrue(regexps_subitem.match("б)")) + self.assertTrue(regexps_subitem.match("б|") is None) def test_number_regexp(self) -> None: - self.assertTrue(regexps_number.match('3. ')) - self.assertTrue(regexps_number.match('3.') is None) - self.assertTrue(regexps_number.match(' 3.ф oksdfnn')) - self.assertTrue(regexps_number.match('\t12')) - self.assertTrue(regexps_number.match('123') is None) - self.assertTrue(regexps_number.match('12.34.56.78')) - self.assertTrue(regexps_number.match('12.3.4.5.6.7.8)')) - self.assertTrue(regexps_number.match('12.34}')) - self.assertTrue(regexps_number.match('1.23.4.Z')) - self.assertTrue(regexps_number.match('lorem ipsum 12') is None) + self.assertTrue(regexps_number.match("3. 
")) + self.assertTrue(regexps_number.match("3.") is None) + self.assertTrue(regexps_number.match(" 3.ф oksdfnn")) + self.assertTrue(regexps_number.match("\t12")) + self.assertTrue(regexps_number.match("123") is None) + self.assertTrue(regexps_number.match("12.34.56.78")) + self.assertTrue(regexps_number.match("12.3.4.5.6.7.8)")) + self.assertTrue(regexps_number.match("12.34}")) + self.assertTrue(regexps_number.match("1.23.4.Z")) + self.assertTrue(regexps_number.match("lorem ipsum 12") is None) def test_ends_of_number_regexp(self) -> None: - self.assertTrue(regexps_ends_of_number.fullmatch('ё')) - self.assertTrue(regexps_ends_of_number.fullmatch(' ')) - self.assertTrue(regexps_ends_of_number.fullmatch('')) - self.assertTrue(regexps_ends_of_number.fullmatch('abacaba') is None) - self.assertTrue(regexps_ends_of_number.fullmatch('z')) + self.assertTrue(regexps_ends_of_number.fullmatch("ё")) + self.assertTrue(regexps_ends_of_number.fullmatch(" ")) + self.assertTrue(regexps_ends_of_number.fullmatch("")) + self.assertTrue(regexps_ends_of_number.fullmatch("abacaba") is None) + self.assertTrue(regexps_ends_of_number.fullmatch("z")) def test_year_regexp(self) -> None: - self.assertTrue(regexps_year.fullmatch('1998')) - self.assertTrue(regexps_year.fullmatch('1900')) - self.assertTrue(regexps_year.fullmatch('2000')) - self.assertTrue(regexps_year.fullmatch('2021')) - self.assertTrue(regexps_year.fullmatch('2099')) + self.assertTrue(regexps_year.fullmatch("1998")) + self.assertTrue(regexps_year.fullmatch("1900")) + self.assertTrue(regexps_year.fullmatch("2000")) + self.assertTrue(regexps_year.fullmatch("2021")) + self.assertTrue(regexps_year.fullmatch("2099")) diff --git a/tests/unit_tests/test_misc_tasker.py b/tests/unit_tests/test_misc_tasker.py index 64825354..585d14ea 100644 --- a/tests/unit_tests/test_misc_tasker.py +++ b/tests/unit_tests/test_misc_tasker.py @@ -110,7 +110,7 @@ def _test_task_archive(self, task_path: str) -> None: self.assertEqual((1276, 1754), image.size) def test_images_creators(self) -> None: - test_dict = {'english_doc.docx': 3, 'txt_example.txt': 7} + test_dict = {"english_doc.docx": 3, "txt_example.txt": 7} config = get_test_config() config["labeling_mode"] = True path2docs = get_path_original_documents(config) @@ -119,7 +119,7 @@ def test_images_creators(self) -> None: test_manager = DedocManager(manager_config=self.__create_test_manager_config(config), config=config) for doc in os.listdir(files_dir): - if not doc.endswith(('docx', 'txt')): + if not doc.endswith(("docx", "txt")): continue with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/unit_tests/test_misc_toc_feature_extractor.py b/tests/unit_tests/test_misc_toc_feature_extractor.py index 5e0ee4ed..7389c15d 100644 --- a/tests/unit_tests/test_misc_toc_feature_extractor.py +++ b/tests/unit_tests/test_misc_toc_feature_extractor.py @@ -4,7 +4,6 @@ from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.docx_reader.docx_reader import DocxReader from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TOCFeatureExtractor - from tests.test_utils import get_test_config diff --git a/tests/unit_tests/test_misc_tree_node.py b/tests/unit_tests/test_misc_tree_node.py index 35d38dff..6bd936a0 100644 --- a/tests/unit_tests/test_misc_tree_node.py +++ b/tests/unit_tests/test_misc_tree_node.py @@ -2,10 +2,10 @@ from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation 
import ItalicAnnotation -from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata +from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.tree_node import TreeNode -from dedoc.data_structures.hierarchy_level import HierarchyLevel class TestTreeNode(TestCase): diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py index bc43a0e7..a56c9de2 100644 --- a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -19,24 +19,24 @@ def test_docx_attachments_extractor(self) -> None: Tests attachment extraction from docx files """ attachments_name_list = [ - '_________Microsoft_Visio.vsdx', - '_________Microsoft_Word.docx', - '_____Microsoft_Excel______________________.xlsm', - 'cats.zip', - 'eiler.json', - 'image4.png', - 'image5.gif', - 'lorem.txt', - 'oleObject1.docx', - 'oleObject2.docx', - 'oleObject1.pdf', - 'test.py' + "_________Microsoft_Visio.vsdx", + "_________Microsoft_Word.docx", + "_____Microsoft_Excel______________________.xlsm", + "cats.zip", + "eiler.json", + "image4.png", + "image5.gif", + "lorem.txt", + "oleObject1.docx", + "oleObject2.docx", + "oleObject1.pdf", + "test.py" ] docx_attachment_extractor = DocxAttachmentsExtractor() extracted = 0 for i in range(1, 4): - filename = f'with_attachments_{i}.docx' + filename = f"with_attachments_{i}.docx" with tempfile.TemporaryDirectory() as tmpdir: shutil.copy(os.path.join(self.src_dir, filename), os.path.join(tmpdir, filename)) @@ -67,7 +67,7 @@ def test_pptx_attachments_extractor(self) -> None: pptx_attachment_extractor = PptxAttachmentsExtractor() extracted = 0 for i in range(1, 3): - filename = f'with_attachments_{i}.pptx' + filename = f"with_attachments_{i}.pptx" with tempfile.TemporaryDirectory() as tmpdir: shutil.copy(os.path.join(self.src_dir, filename), os.path.join(tmpdir, filename)) @@ -82,7 +82,7 @@ def test_pptx_attachments_extractor(self) -> None: def test_docx_diagrams_extraction(self) -> None: docx_attachment_extractor = DocxAttachmentsExtractor() docx_dir = os.path.join(os.path.dirname(__file__), "..", "data", "docx") - files = [('diagram_1.docx', 1), ('diagram_2.docx', 5)] + files = [("diagram_1.docx", 1), ("diagram_2.docx", 5)] with tempfile.TemporaryDirectory() as tmp_dir: for file, num_attachments in files: attachments = docx_attachment_extractor.get_attachments(tmp_dir, os.path.join(docx_dir, file), {}) diff --git a/tests/unit_tests/test_module_builders.py b/tests/unit_tests/test_module_builders.py index d52da744..0a161fed 100644 --- a/tests/unit_tests/test_module_builders.py +++ b/tests/unit_tests/test_module_builders.py @@ -21,20 +21,20 @@ class TestBuilders(unittest.TestCase): composition_builder = HierarchyLevelBuilderComposition(builders=builders) def test_creation_of_builders(self) -> None: - builders = self.composition_builder._get_builders(["header"], 'law') + builders = self.composition_builder._get_builders(["header"], "law") self.assertTrue(isinstance(builders[0], HeaderHierarchyLevelBuilder)) - builders = self.composition_builder._get_builders(["header"], 'foiv') + builders = self.composition_builder._get_builders(["header"], "foiv") self.assertTrue(isinstance(builders[0], HeaderHierarchyLevelBuilder)) - builders = self.composition_builder._get_builders(["application"], 'law') + builders = 
self.composition_builder._get_builders(["application"], "law") self.assertTrue(isinstance(builders[0], ApplicationLawHierarchyLevelBuilder)) - builders = self.composition_builder._get_builders(["application"], 'foiv') + builders = self.composition_builder._get_builders(["application"], "foiv") self.assertTrue(isinstance(builders[0], ApplicationFoivHierarchyLevelBuilder)) - builders = self.composition_builder._get_builders(["body"], 'foiv') + builders = self.composition_builder._get_builders(["body"], "foiv") self.assertTrue(isinstance(builders[0], BodyFoivHierarchyLevelBuilder)) - builders = self.composition_builder._get_builders(["body"], 'law') + builders = self.composition_builder._get_builders(["body"], "law") self.assertTrue(isinstance(builders[0], BodyLawHierarchyLevelBuilder)) diff --git a/tests/unit_tests/test_module_converter_docx.py b/tests/unit_tests/test_module_converter_docx.py index 5c1fa6fd..d96cebab 100644 --- a/tests/unit_tests/test_module_converter_docx.py +++ b/tests/unit_tests/test_module_converter_docx.py @@ -1,6 +1,6 @@ import os -from dedoc.common.exceptions.conversion_exception import ConversionException +from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.converters.concrete_converters.docx_converter import DocxConverter from tests.unit_tests.abstract_converter_test import AbstractConverterTest @@ -13,7 +13,7 @@ class TestDocxConverter(AbstractConverterTest): def test_convert_broken_file(self) -> None: extension = ".odt" filename = "broken" - with self.assertRaises(ConversionException): + with self.assertRaises(ConversionError): self._convert(filename=filename, extension=extension, converter=self.converter) def test_convert_odt(self) -> None: diff --git a/tests/unit_tests/test_module_converter_excel.py b/tests/unit_tests/test_module_converter_excel.py index 30915222..296ae97f 100644 --- a/tests/unit_tests/test_module_converter_excel.py +++ b/tests/unit_tests/test_module_converter_excel.py @@ -1,6 +1,6 @@ import os -from dedoc.common.exceptions.conversion_exception import ConversionException +from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.converters.concrete_converters.excel_converter import ExcelConverter from tests.unit_tests.abstract_converter_test import AbstractConverterTest @@ -13,7 +13,7 @@ class TestExcelConverter(AbstractConverterTest): def test_convert_broken_file(self) -> None: extension = ".ods" filename = "broken" - with self.assertRaises(ConversionException): + with self.assertRaises(ConversionError): self._convert(filename=filename, extension=extension, converter=self.converter) def test_convert_ods(self) -> None: diff --git a/tests/unit_tests/test_module_converter_ppt.py b/tests/unit_tests/test_module_converter_ppt.py index 9c97515f..391ffc12 100644 --- a/tests/unit_tests/test_module_converter_ppt.py +++ b/tests/unit_tests/test_module_converter_ppt.py @@ -1,6 +1,6 @@ import os -from dedoc.common.exceptions.conversion_exception import ConversionException +from dedoc.common.exceptions.conversion_error import ConversionError from dedoc.converters.concrete_converters.pptx_converter import PptxConverter from tests.unit_tests.abstract_converter_test import AbstractConverterTest @@ -13,7 +13,7 @@ class TestPPTXConverter(AbstractConverterTest): def test_convert_broken_file(self) -> None: extension = ".odp" filename = "broken" - with self.assertRaises(ConversionException): + with self.assertRaises(ConversionError): self._convert(filename=filename, extension=extension, converter=self.converter) def 
test_convert_odp(self) -> None: diff --git a/tests/unit_tests/test_module_font_classifier.py b/tests/unit_tests/test_module_font_classifier.py index a6c6bf8b..2ef3eb41 100644 --- a/tests/unit_tests/test_module_font_classifier.py +++ b/tests/unit_tests/test_module_font_classifier.py @@ -43,4 +43,4 @@ def test_bold_classification(self) -> None: self.assertIn(BoldAnnotation.name, [annotation.name for annotation in bbox.annotations]) for bbox in page.bboxes[3:]: - self.assertNotIn(BoldAnnotation.name, [annotation.name for annotation in bbox.annotations]) \ No newline at end of file + self.assertNotIn(BoldAnnotation.name, [annotation.name for annotation in bbox.annotations]) diff --git a/tests/unit_tests/test_module_scan_rotator.py b/tests/unit_tests/test_module_scan_rotator.py index 42633ead..8de869dc 100644 --- a/tests/unit_tests/test_module_scan_rotator.py +++ b/tests/unit_tests/test_module_scan_rotator.py @@ -1,9 +1,10 @@ import os import unittest + import cv2 -from tests.test_utils import get_test_config from dedoc.readers.pdf_reader.pdf_image_reader.scan_rotator import ScanRotator +from tests.test_utils import get_test_config class TestScanRotator(unittest.TestCase): diff --git a/tests/unit_tests/test_module_table_detection.py b/tests/unit_tests/test_module_table_detection.py index 405d0bfd..8c6828f1 100644 --- a/tests/unit_tests/test_module_table_detection.py +++ b/tests/unit_tests/test_module_table_detection.py @@ -1,6 +1,7 @@ import os.path import unittest from typing import List + import cv2 import numpy as np @@ -8,7 +9,7 @@ from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.accuracy_table_rec import get_quantitative_parameters from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps, similarity as utils_similarity -from tests.test_utils import get_test_config, get_full_path +from tests.test_utils import get_full_path, get_test_config def similarity(s1: str, s2: str, threshold: float = 0.8) -> bool: @@ -114,28 +115,27 @@ def test_table_recognition_1(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table3.png"), 0) tables = self.get_table(image) - cnt_ACell, cnt_Cell, cnt_Columns, cnt_Rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) - self.assertEqual(cnt_Rows, 8) - self.assertEqual(cnt_Columns, 3) - self.assertEqual(cnt_ACell, 3) - self.assertEqual(cnt_Cell, 24) + self.assertEqual(cnt_rows, 8) + self.assertEqual(cnt_columns, 3) + self.assertEqual(cnt_a_cell, 3) + self.assertEqual(cnt_cell, 24) self.assertTrue(similarity(tables[0].matrix_cells[0][1].text, "Наименование данных")) self.assertTrue(similarity(tables[0].matrix_cells[0][2].text, "Данные")) self.assertTrue(similarity(tables[0].matrix_cells[4][1].text.capitalize(), "Инн")) - self.assertTrue( - similarity(tables[0].matrix_cells[3][1].text, "Руководитель (ФИО, телефон,\nфакс, электронный адрес)")) + self.assertTrue(similarity(tables[0].matrix_cells[3][1].text, "Руководитель (ФИО, телефон,\nфакс, электронный адрес)")) def test_table_recognition_2(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table4.jpg"), 0) tables = self.get_table(image) - cnt_ACell, cnt_Cell, cnt_Columns, cnt_Rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = 
get_quantitative_parameters(tables[0].matrix_cells) - self.assertEqual(cnt_Rows, 5) - self.assertEqual(cnt_Columns, 3) - self.assertEqual(cnt_ACell, 3) - self.assertEqual(cnt_Cell, 15) + self.assertEqual(cnt_rows, 5) + self.assertEqual(cnt_columns, 3) + self.assertEqual(cnt_a_cell, 3) + self.assertEqual(cnt_cell, 15) self.assertTrue(similarity(tables[0].matrix_cells[0][1].text, "Перечень основных данных и\nтребований")) self.assertTrue(similarity(tables[0].matrix_cells[0][2].text, "Основные данные и требования")) self.assertTrue(similarity(tables[0].matrix_cells[3][1].text, "Количество")) @@ -145,12 +145,12 @@ def test_table_recognition_3(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_ACell, cnt_Cell, cnt_Columns, cnt_Rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) - self.assertEqual(cnt_Rows, 13) - self.assertEqual(cnt_Columns, 3) - self.assertEqual(cnt_ACell, 3) - self.assertEqual(cnt_Cell, 39) + self.assertEqual(cnt_rows, 13) + self.assertEqual(cnt_columns, 3) + self.assertEqual(cnt_a_cell, 3) + self.assertEqual(cnt_cell, 39) self.assertTrue(similarity(tables[0].matrix_cells[0][1].text, "Техническая характеристика")) self.assertTrue(similarity(tables[0].matrix_cells[0][2].text, "Показатель")) self.assertTrue(similarity(tables[0].matrix_cells[6][1].text, "Использование крана и его механизмов")) @@ -160,12 +160,12 @@ def test_table_recognition_4(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_ACell, cnt_Cell, cnt_Columns, cnt_Rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) - self.assertEqual(cnt_Rows, 13) - self.assertEqual(cnt_Columns, 3) - self.assertEqual(cnt_ACell, 3) - self.assertEqual(cnt_Cell, 39) + self.assertEqual(cnt_rows, 13) + self.assertEqual(cnt_columns, 3) + self.assertEqual(cnt_a_cell, 3) + self.assertEqual(cnt_cell, 39) self.assertTrue(similarity(tables[0].matrix_cells[0][1].text, "Техническая характеристика")) self.assertTrue(similarity(tables[0].matrix_cells[0][2].text, "Показатель")) self.assertTrue(similarity(tables[0].matrix_cells[6][1].text, "Использование крана и его механизмов")) @@ -175,12 +175,12 @@ def test_table_recognition_with_rotate_5(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table6.png"), 0) tables = self.get_table(image) - cnt_ACell, cnt_Cell, cnt_Columns, cnt_Rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) - self.assertEqual(cnt_Rows, 3) - self.assertEqual(cnt_Columns, 7) - self.assertEqual(cnt_ACell, 7) - self.assertEqual(cnt_Cell, 21) + self.assertEqual(cnt_rows, 3) + self.assertEqual(cnt_columns, 7) + self.assertEqual(cnt_a_cell, 7) + self.assertEqual(cnt_cell, 21) self.assertTrue(similarity(tables[0].matrix_cells[0][1].text, "Группа")) self.assertTrue(similarity(tables[0].matrix_cells[0][3].text, "Наименование")) self.assertTrue(similarity(tables[0].matrix_cells[2][2].text, "Новая\nпозиция")) diff --git a/tests/unit_tests/test_pdf_reader.py b/tests/unit_tests/test_pdf_reader.py new file mode 100644 index 00000000..603a84e6 --- /dev/null +++ b/tests/unit_tests/test_pdf_reader.py @@ -0,0 +1,144 @@ +import os +import shutil +import 
unittest
+import re
+from tempfile import TemporaryDirectory
+from typing import List
+
+import cv2
+
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier
+from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
+from dedoc.readers.pdf_reader.pdf_image_reader.scan_rotator import ScanRotator
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
+from tests.test_utils import get_test_config
+
+
+class TestPDFReader(unittest.TestCase):
+    checkpoint_path = get_test_config()["resources_path"]
+    config = get_test_config()
+    orientation_classifier = ColumnsOrientationClassifier(on_gpu=False, checkpoint_path=checkpoint_path, delete_lines=True, config=config)
+
+    def _split_lines_on_pages(self, lines: List[LineWithMeta]) -> List[List[str]]:
+        pages = set(map(lambda x: x.metadata.page_id, lines))
+        lines_by_page = [[line.line for line in lines if line.metadata.page_id == page_id] for page_id in pages]
+
+        return lines_by_page
+
+    def test_scan_rotator(self) -> None:
+        scan_rotator = ScanRotator(config=get_test_config())
+        imgs_path = [f"../data/scan_rotator/rotated_{i}.jpg" for i in range(1, 5)]
+        angles = [0.061732858955328755, -0.017535263190370427, 0.12228411148417097, 0]
+
+        for i in range(len(imgs_path)):
+            path = os.path.join(os.path.dirname(__file__), imgs_path[i])
+            image = cv2.imread(path)
+            _, orientation = self.orientation_classifier.predict(image)
+            angle_predict = self.orientation_classifier.classes[2 + orientation]
+            rotated, angle = scan_rotator.auto_rotate(image, angle_predict)
+            self.assertAlmostEqual(angle, angles[i], delta=8)
+
+    def test_scan_orientation(self) -> None:
+        scan_rotator = ScanRotator(config=get_test_config())
+        imgs_path = [f"../data/scanned/orient_{i}.png" for i in range(1, 5)]
+        angles = [90.0, 90.0, 270.0, 270.0]
+        max_delta = 10.0
+        for i in range(len(imgs_path)):
+            path = os.path.join(os.path.dirname(__file__), imgs_path[i])
+            image = cv2.imread(path)
+            _, angle_predict = self.orientation_classifier.predict(image)
+            rotated, angle = scan_rotator.auto_rotate(image, angle_predict)
+            self.assertTrue(abs(angle - angles[i]) < max_delta)
+
+    def test_header_footer_search(self) -> None:
+        config = get_test_config()
+        any_doc_reader = PdfTxtlayerReader(config=config)
+        with TemporaryDirectory() as tmpdir:
+            filename = "prospectus.pdf"
+            path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename)
+            shutil.copy(path, os.path.join(tmpdir, filename))
+            result = any_doc_reader.read(os.path.join(tmpdir, filename),
+                                         document_type=None,
+                                         parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"})
+
+            lines_by_page = self._split_lines_on_pages(result.lines)
+
+            headers = [lines[0] for lines in lines_by_page if lines[0] == "Richelieu Bond \n"]
+            footers = [lines[-1] for lines in lines_by_page if re.match(r"^\s*-( )*[0-9]+( )*-\s*$", lines[-1])]
+
+            self.assertEqual(len(headers), 0)
+            self.assertEqual(len(footers), 0)
+
+    def test_header_footer_search_2(self) -> None:
+        config = get_test_config()
+        any_doc_reader = PdfTxtlayerReader(config=config)
+        with TemporaryDirectory() as tmpdir:
+            filename = "with_changed_header_footer.pdf"
+            path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename)
+            shutil.copy(path, os.path.join(tmpdir, filename))
+            result = any_doc_reader.read(os.path.join(tmpdir, filename),
+                                         document_type=None,
+                                         parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"})
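+            # header/footer analysis is enabled above, so the page-leading
+            # "Richelieu Bond" header and the "- N -" page-number footers are
+            # expected to be filtered out of the extracted lines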
+
+            lines_by_page = self._split_lines_on_pages(result.lines)
+
+            headers = [lines[0] for lines in lines_by_page
+                       if lines[0] == "Richelieu Bond \n"]
+            footers = [lines[-1] for lines in lines_by_page
+                       if re.match(r"^\s*-( )*[0-9]+( )*-\s*$", lines[-1])]
+
+            self.assertEqual(len(headers), 0)
+            self.assertEqual(len(footers), 0)
+
+    def test_header_footer_search_3(self) -> None:
+        config = get_test_config()
+        any_doc_reader = PdfTxtlayerReader(config=config)
+        with TemporaryDirectory() as tmpdir:
+            filename = "with_header_footer_2.pdf"
+            path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer", filename)
+            shutil.copy(path, os.path.join(tmpdir, filename))
+            result = any_doc_reader.read(os.path.join(tmpdir, filename),
+                                         document_type=None,
+                                         parameters={"need_header_footer_analysis": "True", "need_pdf_table_analysis": "False"})
+
+            lines_by_page = self._split_lines_on_pages(result.lines)
+
+            headers = [lines[0] for lines in lines_by_page
+                       if lines[0] == "QUEST MANAGEMENT, SICAV\n"]
+            footers = [lines[-1] for lines in lines_by_page
+                       if re.match(r"^\s*[0-9]\s*$", lines[-1])]
+
+            self.assertEqual(len(headers), 1)
+            self.assertEqual(len(footers), 0)
+
+    def test_long_list_pdf(self) -> None:
+        config = get_test_config()
+        any_doc_reader = PdfImageReader(config=config)
+        path = os.path.join(os.path.dirname(__file__), "../data/scanned/doc_with_long_list.pdf")
+        result = any_doc_reader.read(path, document_type=None, parameters={"need_pdf_table_analysis": "False"})
+        list_elements = result.lines[1:]
+        self.assertEqual(list_elements[0].line.lower().strip(), "1. январь")
+        self.assertEqual(list_elements[1].line.lower().strip(), "2. февраль")
+        self.assertEqual(list_elements[2].line.lower().strip(), "3. март")
+        self.assertEqual(list_elements[3].line.lower().strip(), "4. апрель")
+        self.assertEqual(list_elements[4].line.lower().strip(), "5. май")
+        self.assertEqual(list_elements[5].line.lower().strip(), "6. июнь")
+        self.assertEqual(list_elements[6].line.lower().strip(), "7. июль")
+        self.assertEqual(list_elements[7].line.lower().strip(), "8. август")
+        # item 9 wraps onto a second physical line in the scan, so its tail
+        # ("разные!") is recognized as a separate line
+        self.assertEqual(list_elements[8].line.lower().strip(),
+                         "9. сентябрь в сентябре, в сентябре много листьев на земле желтые и красные! все такие")
+        self.assertEqual(list_elements[9].line.lower().strip(), "разные!")
+        self.assertEqual(list_elements[10].line.lower().strip(), "10. октябрь")
+        self.assertEqual(list_elements[11].line.lower().strip(), "11. ноябрь")
+        self.assertEqual(list_elements[12].line.lower().strip(), "12. декабрь")
+
+    def test_pdf_text_layer(self) -> None:
+        config = get_test_config()
+        any_doc_reader = PdfTxtlayerReader(config=config)
+        path = os.path.join(os.path.dirname(__file__), "../data/pdf_with_text_layer/english_doc.pdf")
+        result = any_doc_reader.read(path, document_type=None, parameters={})
+        for line in result.lines:
+            # check that annotations are not duplicated
+            annotations = line.annotations
+            annotations_set = {(a.name, a.value, a.start, a.end) for a in annotations}
+            self.assertEqual(len(annotations_set), len(annotations))
diff --git a/tests/unit_tests/test_tasker.py b/tests/unit_tests/test_tasker.py
new file mode 100644
index 00000000..e558c6ee
--- /dev/null
+++ b/tests/unit_tests/test_tasker.py
@@ -0,0 +1,166 @@
+import json
+import os
+import tempfile
+import unittest
+import zipfile
+from zipfile import ZipFile
+
+from PIL import Image
+
+from dedoc.attachments_handler.attachments_handler import AttachmentsHandler
+from dedoc.converters.file_converter import FileConverterComposition
+from dedoc.manager.dedoc_manager import DedocManager
+from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
+from dedoc.readers.docx_reader.docx_reader import DocxReader
+from dedoc.readers.reader_composition import ReaderComposition
+from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
+from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
+from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
+from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor
+from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor
+from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
+from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition
+from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker
+from dedoc.train_dataset.taskers.images_creators.concrete_creators.docx_images_creator import DocxImagesCreator
+from dedoc.train_dataset.taskers.images_creators.concrete_creators.txt_images_creator import TxtImagesCreator
+from dedoc.train_dataset.taskers.tasker import Tasker
+from dedoc.train_dataset.train_dataset_utils import get_path_original_documents
+from tests.test_utils import get_test_config
+
+
+class TestTasker(unittest.TestCase):
+    base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "taskers"))
+    path2bboxes = os.path.join(base_path, "bboxes.jsonlines")
+    path2lines = os.path.join(base_path, "lines.jsonlines")
+    path2docs = os.path.join(base_path, "images")
+    manifest_path = os.path.join(base_path, "test_manifest.md")
+    config_path = os.path.join(base_path, "test_config.json")
+
+    def test_paths(self) -> None:
+        self.assertTrue(os.path.isfile(self.path2bboxes), self.path2bboxes)
+        self.assertTrue(os.path.isfile(self.path2lines), self.path2lines)
+        self.assertTrue(os.path.isfile(self.manifest_path), self.manifest_path)
+        self.assertTrue(os.path.isfile(self.config_path), self.config_path)
+        self.assertTrue(os.path.isdir(self.path2docs), self.path2docs)
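+
+    # the two tests below differ only in task_size: every archive yielded by
+    # create_tasks must contain the manifest, the config file and at least one
+    # rendered image (verified in _test_task_archive)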
+
+    def test_line_label_tasker_size1(self) -> None:
+        tasker = self._get_line_label_classifier()
+        task_cnt = 0
+        for task_path in tasker.create_tasks(task_size=10):
+            task_cnt += 1
+            self._test_task_archive(task_path)
+
+    def test_line_label_tasker_size2(self) -> None:
+        tasker = self._get_line_label_classifier()
+
+        task_cnt = 0
+        for task_path in tasker.create_tasks(task_size=2):
+            task_cnt += 1
+            self._test_task_archive(task_path)
+
+    def test_tasker(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            taskers = {"law_classifier": LineLabelTasker(
+                path2bboxes=self.path2bboxes,
+                path2lines=self.path2lines,
+                path2docs=self.path2docs,
+                manifest_path=self.manifest_path,
+                config_path=self.config_path,
+                tmp_dir=tmpdir,
+                config=get_test_config()
+            )}
+            tasker = Tasker(boxes_label_path=self.path2bboxes,
+                            line_info_path=self.path2lines,
+                            images_path=self.path2docs,
+                            save_path=tmpdir,
+                            concrete_taskers=taskers,
+                            config=get_test_config())
+            tasks_path, task_size = tasker.create_tasks(type_of_task="law_classifier", task_size=1)
+            self.assertTrue(os.path.isfile(tasks_path))
+            self.assertEqual(1, task_size)
+            with ZipFile(tasks_path) as archive:
+                self.assertIn("original_documents.zip", archive.namelist())
+
+    def _get_line_label_classifier(self) -> LineLabelTasker:
+        config = get_test_config()
+        tasker = LineLabelTasker(
+            path2bboxes=self.path2bboxes,
+            path2lines=self.path2lines,
+            path2docs=self.path2docs,
+            manifest_path=self.manifest_path,
+            config_path=self.config_path,
+            tmp_dir="/tmp/tasker_test",
+            config=config
+        )
+        return tasker
+
+    def _test_task_archive(self, task_path: str) -> None:
+        self.assertTrue(os.path.isfile(task_path))
+        with ZipFile(task_path) as archive:
+            namelist = [name.split("/", maxsplit=1)[-1] for name in archive.namelist()]
+            self.assertIn("test_config.json", namelist)
+            self.assertIn("test_manifest.md", namelist)
+            images_paths = [image for image in archive.namelist() if "_img_bbox_" in image]
+            self.assertTrue(len(images_paths) > 0)
+            for image_path in images_paths:
+                with archive.open(image_path) as image_file:
+                    image = Image.open(image_file)
+                    self.assertEqual((1276, 1754), image.size)
+
+    def test_images_creators(self) -> None:
+        test_dict = {"english_doc.docx": 3, "txt_example.txt": 7}
+        config = get_test_config()
+        config["labeling_mode"] = True
+        path2docs = get_path_original_documents(config)
+        files_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "images_creator"))
+        images_creators = [DocxImagesCreator(path2docs, config=get_test_config()), TxtImagesCreator(path2docs, config=get_test_config())]
+
+        test_manager = DedocManager.from_config(version="0", manager_config=self.__create_test_manager_config(config), config=config)
+        for doc in os.listdir(files_dir):
+            if not doc.endswith(("docx", "txt")):
+                continue
+
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                with zipfile.ZipFile(os.path.join(tmp_dir, "archive.zip"), "w") as archive:
+                    _ = test_manager.parse_file(file_path=os.path.join(files_dir, doc),
+                                                parameters=dict(document_type="law"),
+                                                original_file_name=doc)
+                    lines_path = os.path.join(config["intermediate_data_path"], "lines.jsonlines")
+                    self.assertTrue(os.path.isfile(lines_path))
+                    with open(lines_path, "r") as f:
+                        lines = [json.loads(line) for line in f]
+                    original_doc = lines[0]["original_document"]
+                    path = os.path.join(get_path_original_documents(config), original_doc)
+                    self.assertTrue(os.path.isfile(path))
+                    for images_creator in images_creators:
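+                        # the first creator that recognizes the lines (docx or txt
+                        # source) renders its page images into the task archive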
+                        if images_creator.can_read(lines):
+                            images_creator.add_images(page=lines, archive=archive)
+                            break
+                self.assertEqual(len(archive.namelist()), test_dict[doc])
+                os.remove(path)
+                os.remove(lines_path)
+
+        config.pop("labeling_mode")
+
+    def __create_test_manager_config(self, config: dict) -> dict:
+        readers = [DocxReader(config=config), RawTextReader(config=config)]
+        metadata_extractors = [BaseMetadataExtractor()]
+        law_extractors = {
+            FoivLawStructureExtractor.document_type: FoivLawStructureExtractor(config=config),
+            LawStructureExtractor.document_type: LawStructureExtractor(config=config)
+        }
+        structure_extractors = {
+            DefaultStructureExtractor.document_type: DefaultStructureExtractor(),
+            ClassifyingLawStructureExtractor.document_type: ClassifyingLawStructureExtractor(extractors=law_extractors, config=config)
+        }
+
+        return dict(
+            converter=FileConverterComposition(converters=[]),
+            reader=ReaderComposition(readers=readers),
+            structure_extractor=StructureExtractorComposition(extractors=structure_extractors, default_key="other"),
+            structure_constructor=StructureConstructorComposition(default_constructor=TreeConstructor(), constructors={"tree": TreeConstructor()}),
+            document_metadata_extractor=MetadataExtractorComposition(extractors=metadata_extractors),
+            attachments_extractor=AttachmentsHandler(config=config)
+        )
diff --git a/tests/unit_tests/test_txt_law_reader.py b/tests/unit_tests/test_txt_law_reader.py
new file mode 100644
index 00000000..7da510cc
--- /dev/null
+++ b/tests/unit_tests/test_txt_law_reader.py
@@ -0,0 +1,38 @@
+import os
+
+from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
+from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor
+from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
+from tests.test_utils import get_test_config
+
+
+class TestLawTxtReader(AbstractTestApiDocReader):
+    config = get_test_config()
+    txt_reader = RawTextReader(config=config)
+    metadata_extractor = BaseMetadataExtractor()
+    law_extractor = LawStructureExtractor(config=config)
+
+    def _get_abs_path(self, file_name: str) -> str:
+        return os.path.join(self.data_directory_path, "laws", file_name)
+
+    def test_spaces(self) -> None:
+        path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt")
+        directory, filename = os.path.split(path)
+        document = self.txt_reader.read(path=path, document_type="law", parameters={})
+        document = self.metadata_extractor.add_metadata(document, directory, filename, filename, filename, "")
+        document = self.law_extractor.extract_structure(document, {})
+
+        self.assertListEqual([], document.attachments)
+        self.assertListEqual([], document.tables)
+        lines = document.lines
+        self.assertEqual("\n", lines[0].line)
+        self.assertEqual(0, lines[0].metadata.line_id)
+        self.assertEqual(" \n", lines[1].line)
+        self.assertEqual(1, lines[1].metadata.line_id)
+        self.assertEqual("\n", lines[2].line)
+        self.assertEqual(2, lines[2].metadata.line_id)
+        self.assertEqual(" \n", lines[3].line)
+        self.assertEqual(3, lines[3].metadata.line_id)
+        self.assertEqual("ЗАКОН\n", lines[4].line)
+        self.assertEqual(4, lines[4].metadata.line_id)