diff --git a/.flake8 b/.flake8 index 401f544b..c0511db7 100644 --- a/.flake8 +++ b/.flake8 @@ -28,6 +28,7 @@ exclude = *__init__.py, resources, venv, + .venv, build, dedoc.egg-info, docs/_build, @@ -48,5 +49,5 @@ per-file-ignores = scripts/*:T201 scripts/benchmark_pdf_performance*:JS101 tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802 - docs/source/_static/code_examples/*:I251 + docs/source/_static/code_examples/*:I251,T201 docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 4cb468e8..262c3ba5 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -33,3 +33,4 @@ jobs: python dedoc_usage_tutorial.py python dedoc_add_new_doc_type_tutorial.py python dedoc_add_new_structure_type_tutorial.py + python dedoc_using_patterns_tutorial.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 09231202..2b5eae7a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py + exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py args: - "--config=.flake8" additional_dependencies: [ diff --git a/README.md b/README.md index f4a8c726..a14ad28b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Dedoc +[![Telegram](https://img.shields.io/badge/chat-on%20Telegram-2ba2d9.svg)](https://t.me/dedoc_chat) [![image](https://img.shields.io/pypi/pyversions/dedoc.svg)](https://pypi.python.org/pypi/dedoc) [![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/) [![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc) @@ -94,6 +95,12 @@ Relevant documentation of dedoc is available 
[here](https://dedoc.readthedocs.io * Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023) * Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023) +# Join Our Community + +Have questions or want to discuss Dedoc? Join our [Telegram chat](https://t.me/dedoc_chat) and connect with the community and the developers. + +Join our [Telegram channel](https://t.me/dedoc_channel) to get notifications about the most recent updates. + # Installation instructions This project has a REST api and you can run it in Docker container. diff --git a/VERSION b/VERSION index b539adea..c0943d3e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.7 \ No newline at end of file +2.3 \ No newline at end of file diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 8f3e1415..d1f7d5cf 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -8,6 +8,7 @@ class QueryParameters: # type of document structure parsing document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain") + patterns: str = Form("", description='Patterns for default document type (when document_type="")') structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type") return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"], description="Response representation, most types (except json) are used for debug purposes only") @@ -39,6 +40,7 @@ class QueryParameters: '"no_change" - set vertical orientation of the document without using an orientation classifier') need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result") need_binarization: 
str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)") + need_gost_frame_analysis: str = Form("false", enum=["true", "false"], description="Parameter for detecting and ignoring GOST frame of the document") # other formats handling delimiter: Optional[str] = Form(None, description="Column separator for CSV files") diff --git a/dedoc/api/schema/annotation.py b/dedoc/api/schema/annotation.py index 9add75dd..225de396 100644 --- a/dedoc/api/schema/annotation.py +++ b/dedoc/api/schema/annotation.py @@ -5,6 +5,16 @@ class Annotation(BaseModel): """ The piece of information about the text line: it's appearance or links to another document object. For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic. + + :ivar start: start of the annotated text + :ivar end: end of the annotated text (end isn't included) + :ivar name: annotation's name, specific for each type of annotation + :ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc. + + :vartype start: int + :vartype end: int + :vartype name: str + :vartype value: str """ start: int = Field(description="Start of the annotated text", example=0) end: int = Field(description="End of the annotated text (end isn't included)", example=5) diff --git a/dedoc/api/schema/cell_with_meta.py b/dedoc/api/schema/cell_with_meta.py index efeb0fdf..05cb6f66 100644 --- a/dedoc/api/schema/cell_with_meta.py +++ b/dedoc/api/schema/cell_with_meta.py @@ -8,6 +8,16 @@ class CellWithMeta(BaseModel): """ Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). 
+ + :ivar lines: list of textual lines of the cell + :ivar colspan: number of columns to span (for cells merged horizontally) + :ivar rowspan: number of rows to span (for cells merged vertically) + :ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display) + + :vartype lines: List[LineWithMeta] + :vartype colspan: int + :vartype rowspan: int + :vartype invisible: bool """ lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations") rowspan: int = Field(description="Number of rows to span like in HTML format", example=1) diff --git a/dedoc/api/schema/document_content.py b/dedoc/api/schema/document_content.py index 5127650e..e9d8a47c 100644 --- a/dedoc/api/schema/document_content.py +++ b/dedoc/api/schema/document_content.py @@ -9,6 +9,12 @@ class DocumentContent(BaseModel): """ Content of the document - structured text and tables. + + :ivar tables: list of document tables + :ivar structure: tree structure of the document nodes with text and additional metadata + + :vartype tables: List[Table] + :vartype structure: TreeNode """ structure: TreeNode = Field(description="Tree structure where content of the document is organized") tables: List[Table] = Field(description="List of document tables") diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py index 4d814fc3..fb45c075 100644 --- a/dedoc/api/schema/document_metadata.py +++ b/dedoc/api/schema/document_metadata.py @@ -4,6 +4,26 @@ class DocumentMetadata(BaseModel): """ Document metadata like its name, size, author, etc. 
+ + :ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) + :ivar temporary_file_name: file name during parsing (unique name after rename and conversion) + :ivar size: size of the original file in bytes + :ivar modified_time: time of the last modification in unix time format (seconds since the epoch) + :ivar created_time: time of the creation in unixtime + :ivar access_time: time of the last access to the file in unixtime + :ivar file_type: mime type of the file + :ivar uid: document unique identifier (useful for attached files) + + :vartype file_name: str + :vartype temporary_file_name: str + :vartype size: int + :vartype modified_time: int + :vartype created_time: int + :vartype access_time: int + :vartype file_type: str + :vartype uid: str + + Additional variables may be added with other file metadata. """ class Config: extra = Extra.allow diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py index 37e893d8..e123f28d 100644 --- a/dedoc/api/schema/line_metadata.py +++ b/dedoc/api/schema/line_metadata.py @@ -6,10 +6,20 @@ class LineMetadata(BaseModel): """ Holds information about document node/line metadata, such as page number or line type. + + :ivar paragraph_type: type of the document line/paragraph (header, list_item, list, etc.) + :ivar page_id: page number where paragraph starts, the numeration starts from page 0 + :ivar line_id: line number inside the entire document, the numeration starts from line 0 + + :vartype paragraph_type: str + :vartype page_id: int + :vartype line_id: Optional[int] + + Additional variables may be added with other line metadata. 
""" class Config: extra = Extra.allow - paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text") + paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list, etc.)", example="raw_text") page_id: int = Field(description="Page number of the line/paragraph beginning", example=0) line_id: Optional[int] = Field(description="Line number", example=1) diff --git a/dedoc/api/schema/line_with_meta.py b/dedoc/api/schema/line_with_meta.py index 1c155ab5..a8f61b1d 100644 --- a/dedoc/api/schema/line_with_meta.py +++ b/dedoc/api/schema/line_with_meta.py @@ -8,6 +8,12 @@ class LineWithMeta(BaseModel): """ Textual line with text annotations. + + :ivar text: text of the line + :ivar annotations: text annotations (font, size, bold, italic, etc.) + + :vartype text: str + :vartype annotations: List[Annotation] """ text: str = Field(description="Text of the line", example="Some text") - annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)") + annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)") diff --git a/dedoc/api/schema/parsed_document.py b/dedoc/api/schema/parsed_document.py index 076540a4..d4a7d846 100644 --- a/dedoc/api/schema/parsed_document.py +++ b/dedoc/api/schema/parsed_document.py @@ -9,6 +9,18 @@ class ParsedDocument(BaseModel): """ Holds information about the document content, metadata and attachments. + + :ivar content: document text (hierarchy of nodes) and tables + :ivar attachments: result of analysis of attached files (empty if with_attachments=False) + :ivar metadata: document metadata such as size, creation date and so on. 
+ :ivar warnings: list of warnings and possible errors, arising in the process of document parsing + :ivar version: version of the program that parsed this document + + :vartype content: DocumentContent + :vartype attachments: List[ParsedDocument] + :vartype metadata: DocumentMetadata + :vartype warnings: List[str] + :vartype version: str """ content: DocumentContent = Field(description="Document text and tables") metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on") diff --git a/dedoc/api/schema/table.py b/dedoc/api/schema/table.py index 52b2b59c..e834f1bf 100644 --- a/dedoc/api/schema/table.py +++ b/dedoc/api/schema/table.py @@ -11,6 +11,12 @@ class Table(BaseModel): Holds information about tables in the document. We assume that a table has rectangle form (has the same number of columns in each row). Table representation is row-based i.e. external list contains list of rows. + + :ivar metadata: table metadata as location, title and so on + :ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes) + + :vartype metadata: TableMetadata + :vartype cells: List[List[CellWithMeta]] """ cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)") metadata: TableMetadata = Field(description="Table meta information") diff --git a/dedoc/api/schema/table_metadata.py b/dedoc/api/schema/table_metadata.py index 53299a16..b75dbc21 100644 --- a/dedoc/api/schema/table_metadata.py +++ b/dedoc/api/schema/table_metadata.py @@ -6,6 +6,16 @@ class TableMetadata(BaseModel): """ Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on. 
+ + :ivar page_id: number of the page where table starts + :ivar uid: unique identifier of the table (used for linking table to text) + :ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition + :ivar title: table's title + + :vartype page_id: Optional[int] + :vartype uid: str + :vartype rotated_angle: float + :vartype title: str """ page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0) uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f") diff --git a/dedoc/api/schema/tree_node.py b/dedoc/api/schema/tree_node.py index 5eeedd42..2aeeccae 100644 --- a/dedoc/api/schema/tree_node.py +++ b/dedoc/api/schema/tree_node.py @@ -10,6 +10,18 @@ class TreeNode(BaseModel): """ Helps to represent document as recursive tree structure. It has list of children `TreeNode` nodes (empty list for a leaf node). + + :ivar node_id: unique node identifier + :ivar text: text of the node (may contain several lines) + :ivar annotations: some metadata related to the part of the text (such as font size) + :ivar metadata: metadata that refers to the entire node (such as node type) + :ivar subparagraphs: list of children of this node + + :vartype node_id: str + :vartype text: str + :vartype annotations: List[Annotation] + :vartype metadata: LineMetadata + :vartype subparagraphs: List[TreeNode] """ node_id: str = Field(description="Document element identifier. It is unique within a document content tree. " "The identifier consists of numbers separated by dots where each number " diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index 423dbcfe..4362832a 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -28,7 +28,7 @@
+
+