Skip to content

Commit

Permalink
update master (#451)
Browse files Browse the repository at this point in the history
Co-authored-by: Andrew Perminov <[email protected]>
  • Loading branch information
NastyBoget and dronperminov authored Jun 5, 2024
1 parent 6b84563 commit 370f6ef
Show file tree
Hide file tree
Showing 22 changed files with 672 additions and 111 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.2.2
2.2.3
1 change: 0 additions & 1 deletion dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ class QueryParameters:
need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")

# tables handling
need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
Expand Down
23 changes: 16 additions & 7 deletions dedoc/api/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from dedoc.data_structures.parsed_document import ParsedDocument
from dedoc.data_structures.table import Table
from dedoc.data_structures.tree_node import TreeNode
from dedoc.extensions import converted_mimes, recognized_mimes


def __prettify_text(text: str) -> Iterator[str]:
Expand Down Expand Up @@ -148,11 +149,22 @@ def json2html(text: str,
text += table2html(table, table2id)
text += "<p>&nbsp;</p>"

image_mimes = recognized_mimes.image_like_format.union(converted_mimes.image_like_format)

if attachments is not None and len(attachments) > 0:
text += "<h3> Attachments: </h3>"
for attachment_id, attachment in enumerate(attachments):
attachment_text = json2html(text="", paragraph=attachment.content.structure, tables=attachment.content.tables, attachments=attachment.attachments)
text += f'<div id="{attachment.metadata.uid}"><h4>attachment {attachment_id} ({attachment.metadata.file_name}):</h4>{attachment_text}</div>'
attachment_base64 = f'data:{attachment.metadata.file_type};base64,{attachment.metadata.base64}"'
attachment_link = f'<a href="{attachment_base64}" download="{attachment.metadata.file_name}">{attachment.metadata.file_name}</a>'
is_image = attachment.metadata.file_type in image_mimes
attachment_image = f'<img src="{attachment_base64}">' if is_image else ""

text += f"""<div id="{attachment.metadata.uid}">
<h4>attachment {attachment_id} ({attachment_link}):</h4>
{attachment_image}
{attachment_text}
</div>"""

return text

Expand Down Expand Up @@ -193,12 +205,9 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int], attach2id:
name = annotation.name
value = annotation.value

bool_annotations = [BoldAnnotation.name,
ItalicAnnotation.name,
StrikeAnnotation.name,
SubscriptAnnotation.name,
SuperscriptAnnotation.name,
UnderlinedAnnotation.name]
bool_annotations = [
BoldAnnotation.name, ItalicAnnotation.name, StrikeAnnotation.name, SubscriptAnnotation.name, SuperscriptAnnotation.name, UnderlinedAnnotation.name
]
check_annotations = bool_annotations + [TableAnnotation.name, ReferenceAnnotation.name, AttachAnnotation.name]
if name not in check_annotations and not value.startswith("heading "):
continue
Expand Down
39 changes: 28 additions & 11 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import base64
import dataclasses
import importlib
import json
Expand Down Expand Up @@ -62,41 +63,57 @@ def _get_static_file_path(request: Request) -> str:
return os.path.abspath(os.path.join(directory, file))


def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None:
    """
    Attach the base64-encoded file content to the metadata of each attachment of the document.

    Only the attachments listed directly in ``document_tree.attachments`` are processed.

    :param document_tree: parsed document whose attachments should get the "base64" attribute
    :param attachments_dir: directory where the attachments' temporary files are stored
    """
    for attached_document in document_tree.attachments:
        metadata = attached_document.metadata
        file_path = os.path.join(attachments_dir, metadata.temporary_file_name)

        with open(file_path, "rb") as stream:
            encoded_content = base64.b64encode(stream.read())

        metadata.add_attribute("base64", encoded_content.decode("utf-8"))


@app.post("/upload", response_model=ParsedDocument)
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
parameters = dataclasses.asdict(query_params)
if not file or file.filename == "":
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)

return_format = str(parameters.get("return_format", "json")).lower()

with tempfile.TemporaryDirectory() as tmpdir:
file_path = save_upload_file(file, tmpdir)
document_tree = manager.parse(file_path, parameters=dict(parameters))
document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})

if return_format == "html":
__add_base64_info_to_attachments(document_tree, tmpdir)

return_format = str(parameters.get("return_format", "json")).lower()
if return_format == "html":
html_content = json2html(
text="",
paragraph=document_tree.content.structure,
tables=document_tree.content.tables,
attachments=document_tree.attachments, tabs=0
attachments=document_tree.attachments,
tabs=0
)
return HTMLResponse(content=html_content)
elif return_format == "plain_text":

if return_format == "plain_text":
txt_content = json2txt(paragraph=document_tree.content.structure)
return PlainTextResponse(content=txt_content)
elif return_format == "tree":

if return_format == "tree":
html_content = json2tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
elif return_format == "ujson":

if return_format == "ujson":
return UJSONResponse(content=document_tree.to_api_schema().model_dump())
elif return_format == "collapsed_tree":

if return_format == "collapsed_tree":
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
elif return_format == "pretty_json":

if return_format == "pretty_json":
return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
else:
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_api_schema().model_dump())

logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_api_schema().model_dump())


@app.get("/upload_example")
Expand Down
6 changes: 1 addition & 5 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ <h4>Type of document structure parsing</h4>

<div class="parameters">
<h4>Attachments handling</h4>
<details><summary>with_attachments, need_content_analysis, recursion_deep_attachments, return_base64, attachments_dir</summary>
<details><summary>with_attachments, need_content_analysis, recursion_deep_attachments, return_base64</summary>
<br>
<p>
<label><input name="with_attachments" type="checkbox" value="true"> with_attachments </label>
Expand All @@ -87,10 +87,6 @@ <h4>Attachments handling</h4>
<p>
<label><input name="return_base64" type="checkbox" value="true"> return_base64 </label>
</p>

<p>
<label>attachments_dir <input name="attachments_dir" type="text" size="35" value=""></label>
</p>
</details>
</div>

Expand Down
7 changes: 5 additions & 2 deletions dedoc/data_structures/document_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import uuid
from typing import Dict, Union
from typing import Any, Dict, Union

from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata
from dedoc.data_structures.serializable import Serializable
Expand Down Expand Up @@ -38,8 +38,11 @@ def __init__(self,
self.access_time = access_time
self.file_type = file_type
for key, value in kwargs.items():
setattr(self, key, value)
self.add_attribute(key, value)
self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid

def add_attribute(self, key: str, value: Any) -> None: # noqa
    """
    Set an attribute with the given name on this metadata object.

    :param key: name of the attribute to set
    :param value: value to store under this name
    """
    setattr(self, key, value)

def to_api_schema(self) -> ApiDocumentMetadata:
    """
    Convert this metadata object to its API schema representation.

    All instance attributes (``vars(self)``) are passed to the schema constructor as keyword arguments.

    :return: API schema object built from the instance attributes
    """
    return ApiDocumentMetadata(**vars(self))
39 changes: 10 additions & 29 deletions dedoc/readers/docx_reader/data_structures/docx_document.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import hashlib
import logging
import os
import re
import zipfile
from collections import defaultdict
from typing import List, Optional
from typing import List

from bs4 import BeautifulSoup, Tag

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
Expand All @@ -19,6 +16,7 @@
from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
from dedoc.utils.office_utils import get_bs_from_zip
from dedoc.utils.utils import calculate_file_hash


Expand All @@ -28,8 +26,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
self.path = path
self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}

self.document_bs_tree = self.__get_bs_tree("word/document.xml")
self.document_bs_tree = self.__get_bs_tree("word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
self.document_bs_tree = get_bs_from_zip(self.path, "word/document.xml")
self.document_bs_tree = get_bs_from_zip(self.path, "word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
self.body = self.document_bs_tree.body if self.document_bs_tree else None
self.paragraph_maker = self.__get_paragraph_maker()

Expand All @@ -39,8 +37,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
self.lines = self.__get_lines()

def __get_paragraph_maker(self) -> ParagraphMaker:
styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), self.logger)
num_tree = self.__get_bs_tree("word/numbering.xml")
styles_extractor = StylesExtractor(get_bs_from_zip(self.path, "word/styles.xml"), self.logger)
num_tree = get_bs_from_zip(self.path, "word/numbering.xml")
numbering_extractor = NumberingExtractor(num_tree, styles_extractor) if num_tree else None
styles_extractor.numbering_extractor = numbering_extractor

Expand All @@ -49,8 +47,8 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
path_hash=calculate_file_hash(path=self.path),
styles_extractor=styles_extractor,
numbering_extractor=numbering_extractor,
footnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")),
endnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote")
footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
)

def __get_lines(self) -> List[LineWithMeta]:
Expand Down Expand Up @@ -120,23 +118,6 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d

return lines_with_meta

def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]:
"""
Gets xml bs tree from the given file inside the self.path.
:param filename: name of file to extract the tree
:return: BeautifulSoup tree or None if file wasn't found
"""
try:
with zipfile.ZipFile(self.path) as document:
content = document.read(filename)
content = re.sub(br"\n[\t ]*", b"", content)
soup = BeautifulSoup(content, "xml")
return soup
except KeyError:
return None
except zipfile.BadZipFile:
raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken")

def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
table = DocxTable(xml, self.paragraph_maker)
self.tables.append(table.to_table())
Expand All @@ -150,9 +131,9 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
table_refs[len(self.paragraph_list) - 1].append(table_uid)

def __handle_images_xml(self, xmls: List[Tag], image_refs: dict) -> None:
rels = self.__get_bs_tree("word/_rels/document.xml.rels")
rels = get_bs_from_zip(self.path, "word/_rels/document.xml.rels")
if rels is None:
rels = self.__get_bs_tree("word/_rels/document2.xml.rels")
rels = get_bs_from_zip(self.path, "word/_rels/document2.xml.rels")

images_rels = dict()
for rel in rels.find_all("Relationship"):
Expand Down
51 changes: 51 additions & 0 deletions dedoc/readers/pptx_reader/numbering_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
class NumberingExtractor:
    """
    This class is used to compute numbering text for list items.
    For example: "1.", (i), "○"
    """
    def __init__(self) -> None:
        # Mapping according to the ST_TextAutonumberScheme
        # NOTE we ignore chinese, japanese, hindi, thai
        self.numbering_types = dict(
            arabic="1",  # 1, 2, 3, ..., 10, 11, 12, ...
            alphaLc="a",  # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ...
            alphaUc="A",  # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ...
            romanLc="i",  # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ...
            romanUc="I"  # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ...
        )

        # Templates for the text around the numbering symbol, the key is the suffix of the scheme name
        self.numbering_formatting = dict(
            ParenBoth="({}) ",
            ParenR="{}) ",
            Period="{}. ",
            Plain="{} "
        )

        # Full scheme name (e.g. "arabicPeriod") -> (numbering type, numbering formatting)
        self.combined_types = {
            num_type + num_formatting: (num_type, num_formatting) for num_type in self.numbering_types for num_formatting in self.numbering_formatting
        }

        # Values in descending order, the subtractive pairs (cm, cd, xc, xl, ix, iv) are included,
        # so a greedy conversion gives the standard notation (4 -> "iv" instead of "iiii")
        self.roman_mapping = [
            (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"), (100, "c"), (90, "xc"),
            (50, "l"), (40, "xl"), (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")
        ]

    def get_text(self, numbering: str, shift: int) -> str:
        """
        Computes the next item of the list sequence.

        :param numbering: type of the numbering, e.g. "arabicPeriod", unknown types are handled as "arabicPeriod"
        :param shift: shift from the beginning of list numbering (0 for the first item of the list)
        :return: string representation of the next numbering item
        """
        num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period"))

        if num_type in ("alphaLc", "alphaUc"):
            # after "z" the letter is repeated: aa, bb, ..., zz, aaa, ...
            repeats, letter_offset = divmod(shift, 26)
            num_char = chr(ord(self.numbering_types[num_type]) + letter_offset) * (repeats + 1)
        elif num_type in ("romanLc", "romanUc"):
            # shift is zero-based while roman numerals start from one
            num_char = self._to_roman(shift + 1)
            if num_type == "romanUc":
                num_char = num_char.upper()
        else:
            num_char = str(int(self.numbering_types["arabic"]) + shift)

        return self.numbering_formatting[num_formatting].format(num_char)

    def _to_roman(self, number: int) -> str:
        """
        Converts a positive integer to the lowercase roman numeral.

        :param number: number to convert
        :return: lowercase roman numeral, empty string for non-positive numbers
        """
        result = []
        for value, letters in self.roman_mapping:
            if number <= 0:
                break
            count, number = divmod(number, value)
            result.append(letters * count)
        return "".join(result)
55 changes: 55 additions & 0 deletions dedoc/readers/pptx_reader/paragraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from bs4 import Tag

from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \
StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation
from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
from dedoc.utils.annotation_merger import AnnotationMerger


class PptxParagraph:
    """
    This class corresponds to one textual paragraph of some entity, e.g. shape or table cell (tag <a:p>).
    """
    def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
        """
        :param xml: tag of the paragraph
        :param numbering_extractor: object for computing the numbering text of numbered list items
        :param properties_extractor: object for getting properties of the paragraph and its runs
        """
        self.xml = xml
        # type of the automatic numbering ("arabicPeriod" if the type isn't given), None if the paragraph has no <buAutoNum> tag
        self.numbered_list_type = self.xml.buAutoNum.get("type", "arabicPeriod") if self.xml.buAutoNum else None
        # the "lvl" attribute is incremented, so the level is 1 when the attribute or the <pPr> tag is absent
        self.level = int(self.xml.pPr.get("lvl", 0)) + 1 if self.xml.pPr else 1
        self.numbering_extractor = numbering_extractor
        self.properties_extractor = properties_extractor
        self.annotation_merger = AnnotationMerger()
        annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
        # annotation name -> annotation class, the names are also used as attribute names of the properties objects
        self.dict2annotation = {annotation.name: annotation for annotation in annotations}

    def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
        """
        Make a line with text, annotations and hierarchy level from the paragraph.

        :param page_id: page identifier stored in the line metadata
        :param line_id: line identifier stored in the line metadata
        :param is_title: if True, the line is a header regardless of the paragraph properties
        :param shift: shift from the beginning of list numbering, it is used only for numbered list items
        :return: line with the paragraph text (with "\\n" in the end) and its annotations
        """
        text = ""
        paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
        hierarchy_level = HierarchyLevel.create_raw_text()

        if is_title or paragraph_properties.title:
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
        elif self.numbered_list_type:  # numbered list
            text += self.numbering_extractor.get_text(self.numbered_list_type, shift)
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=2, level_2=self.level, can_be_multiline=False)
        elif self.xml.buChar:  # bullet list
            text += self.xml.buChar["char"] + " "
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=3, level_2=self.level, can_be_multiline=False)

        annotations = []
        if self.xml.r:
            for run in self.xml.find_all("a:r"):
                prev_text = text
                for run_child in run:
                    # add only the text of the <t> child itself: the text of the whole run
                    # would be repeated for each <t> child and may include the text of other children
                    if run_child.name == "t" and run_child.text:
                        text += run_child.text

                # annotations of the run cover the text added by this run: [len(prev_text), len(text))
                run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties)
                annotations.append(SizeAnnotation(start=len(prev_text), end=len(text), value=str(run_properties.size)))
                for property_name in self.dict2annotation:
                    if getattr(run_properties, property_name):
                        annotations.append(self.dict2annotation[property_name](start=len(prev_text), end=len(text), value="True"))

        text = f"{text}\n"
        annotations = self.annotation_merger.merge_annotations(annotations, text)
        # alignment is added after the merge and covers the whole line including the line break
        annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment))
        return LineWithMeta(text, metadata=LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level), annotations=annotations)
Loading

0 comments on commit 370f6ef

Please sign in to comment.