Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tabby bbox annotations #309

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
26 changes: 26 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[flake8]

max-line-length = 160
max-complexity = 13
inline-quotes = "

application-import-names = dedoc, tests
import-order-style = pycharm

exclude =
.git,
__pycache__,
.idea,
.github,
*__init__.py,
resources,
dedoc/scripts,
examples,
docs,
venv,
build,
dedoc.egg-info

# ANN101 (flake8-annotations): "missing type annotation for self in method" - ignored because annotating self is redundant
ignore =
ANN101
6 changes: 3 additions & 3 deletions .github/workflows/test_on_push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Install dependencies
- name: Run lint
run: |
python3 -m pip install --upgrade pip
pip3 install pycodestyle==2.7.0 flake8==3.9.2 flake8-annotations==2.6.2 pyflakes==2.3.1
pip3 install .[lint]
flake8 .
- name: Run tests
run: |
python3 -m unittest -v -f tests/test_style.py
test="true" docker-compose up --build --exit-code-from test
20 changes: 20 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
repos:
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info
args:
- "--config=.flake8"
additional_dependencies: [
flake8-absolute-import==1.0.0.1,
flake8-annotations==2.9.1,
flake8-bugbear==23.3.12,
flake8-builtins==2.1.0,
flake8-import-order==0.18.2,
flake8-print==5.0.0,
flake8-quotes==3.3.2,
flake8-use-fstring==1.4,
pycodestyle==2.9.0,
pep8-naming==0.13.3
]
108 changes: 54 additions & 54 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# noqa
from typing import Any, Optional

from fastapi import Body
from pydantic import BaseModel

Expand Down Expand Up @@ -36,68 +36,68 @@ class QueryParameters(BaseModel):

def __init__(self,
# type of document structure parsing
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),
structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),
return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), # noqa
structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), # noqa
return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), # noqa

# attachments handling
with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),
with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None), # noqa
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), # noqa
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), # noqa
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), # noqa
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), # noqa

# tables handling
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),
need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),
table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),
orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),
orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None),
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), # noqa
need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), # noqa
table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), # noqa
orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), # noqa
orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), # noqa

# pdf handling
pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),
language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),
pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),
is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),
document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),
need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),
need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),
pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), # noqa
language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), # noqa
pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), # noqa
is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), # noqa
document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), # noqa
need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), # noqa
need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), # noqa

# other formats handling
delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),
encoding: Optional[str] = Body(description="a document encoding", default=None),
html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),
handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),
delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), # noqa
encoding: Optional[str] = Body(description="a document encoding", default=None), # noqa
html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), # noqa
handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None), # noqa


**data: Any) -> None:
**data: Any) -> None: # noqa

super().__init__(**data)
self.document_type: str = document_type or ""
self.structure_type: str = structure_type or 'tree'
self.return_format: str = return_format or 'json'

self.with_attachments: str = with_attachments or 'false'
self.need_content_analysis: str = need_content_analysis or 'false'
self.recursion_deep_attachments: str = recursion_deep_attachments or '10'
self.return_base64: str = return_base64 or 'false'
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or 'false'
self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true'
self.table_type: str = table_type or ''
self.orient_analysis_cells: str = orient_analysis_cells or 'false'
self.orient_cell_angle: str = orient_cell_angle or "90"

self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby'
self.language: str = language or "rus+eng"
self.pages: str = pages or ':'
self.is_one_column_document: str = is_one_column_document or 'auto'
self.document_orientation: str = document_orientation or "auto"
self.need_header_footer_analysis: str = need_header_footer_analysis or 'false'
self.need_binarization: str = need_binarization or 'false'

self.delimiter: str = delimiter
self.encoding: str = encoding
self.html_fields: str = html_fields or ''
self.handle_invisible_table: str = handle_invisible_table or 'false'
self.document_type: str = document_type or ""
self.structure_type: str = structure_type or "tree"
self.return_format: str = return_format or "json"

self.with_attachments: str = with_attachments or "false"
self.need_content_analysis: str = need_content_analysis or "false"
self.recursion_deep_attachments: str = recursion_deep_attachments or "10"
self.return_base64: str = return_base64 or "false"
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or "false"
self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true"
self.table_type: str = table_type or ""
self.orient_analysis_cells: str = orient_analysis_cells or "false"
self.orient_cell_angle: str = orient_cell_angle or "90"

self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby"
self.language: str = language or "rus+eng"
self.pages: str = pages or ":"
self.is_one_column_document: str = is_one_column_document or "auto"
self.document_orientation: str = document_orientation or "auto"
self.need_header_footer_analysis: str = need_header_footer_analysis or "false"
self.need_binarization: str = need_binarization or "false"

self.delimiter: str = delimiter
self.encoding: str = encoding
self.html_fields: str = html_fields or ""
self.handle_invisible_table: str = handle_invisible_table or "false"
Loading
Loading