update master (#451)

Co-authored-by: Andrew Perminov <[email protected]>
ispras · Jun 5, 2024 · 370f6ef · 370f6ef
1 parent 6b84563
commit 370f6ef
Show file tree

Hide file tree

Showing 22 changed files with 672 additions and 111 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.2.2
+2.2.3
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -17,7 +17,6 @@ class QueryParameters:
     need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
     recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
     return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
-    attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")
 
     # tables handling
     need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")

diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py
@@ -14,6 +14,7 @@
 from dedoc.data_structures.parsed_document import ParsedDocument
 from dedoc.data_structures.table import Table
 from dedoc.data_structures.tree_node import TreeNode
+from dedoc.extensions import converted_mimes, recognized_mimes
 
 
 def __prettify_text(text: str) -> Iterator[str]:
@@ -148,11 +149,22 @@ def json2html(text: str,
             text += table2html(table, table2id)
             text += "<p>&nbsp;</p>"
 
+    image_mimes = recognized_mimes.image_like_format.union(converted_mimes.image_like_format)
+
     if attachments is not None and len(attachments) > 0:
         text += "<h3> Attachments: </h3>"
         for attachment_id, attachment in enumerate(attachments):
             attachment_text = json2html(text="", paragraph=attachment.content.structure, tables=attachment.content.tables, attachments=attachment.attachments)
-            text += f'<div id="{attachment.metadata.uid}"><h4>attachment {attachment_id} ({attachment.metadata.file_name}):</h4>{attachment_text}</div>'
+            attachment_base64 = f'data:{attachment.metadata.file_type};base64,{attachment.metadata.base64}"'
+            attachment_link = f'<a href="{attachment_base64}" download="{attachment.metadata.file_name}">{attachment.metadata.file_name}</a>'
+            is_image = attachment.metadata.file_type in image_mimes
+            attachment_image = f'<img src="{attachment_base64}">' if is_image else ""
+
+            text += f"""<div id="{attachment.metadata.uid}">
+                <h4>attachment {attachment_id} ({attachment_link}):</h4>
+                {attachment_image}
+                {attachment_text}
+            </div>"""
 
     return text
 
@@ -193,12 +205,9 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int], attach2id:
         name = annotation.name
         value = annotation.value
 
-        bool_annotations = [BoldAnnotation.name,
-                            ItalicAnnotation.name,
-                            StrikeAnnotation.name,
-                            SubscriptAnnotation.name,
-                            SuperscriptAnnotation.name,
-                            UnderlinedAnnotation.name]
+        bool_annotations = [
+            BoldAnnotation.name, ItalicAnnotation.name, StrikeAnnotation.name, SubscriptAnnotation.name, SuperscriptAnnotation.name, UnderlinedAnnotation.name
+        ]
         check_annotations = bool_annotations + [TableAnnotation.name, ReferenceAnnotation.name, AttachAnnotation.name]
         if name not in check_annotations and not value.startswith("heading "):
             continue

diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py
@@ -1,3 +1,4 @@
+import base64
 import dataclasses
 import importlib
 import json
@@ -62,41 +63,57 @@ def _get_static_file_path(request: Request) -> str:
     return os.path.abspath(os.path.join(directory, file))
 
 
+def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None:
+    """
+    Adds the "base64" attribute with the encoded file content to the metadata of every attachment of the document,
+    including attachments of attachments.
+
+    :param document_tree: parsed document whose attachments should be enriched
+    :param attachments_dir: directory where the attachments' files are stored
+    """
+    pending = list(document_tree.attachments or [])
+    while pending:
+        attachment = pending.pop()
+        # json2html renders nested attachments recursively and reads metadata.base64 for each of them,
+        # so the attribute is needed on every level, not only on the top one
+        # NOTE(review): assumes files of nested attachments are saved in the same attachments_dir - confirm
+        pending.extend(attachment.attachments or [])
+
+        attachment_path = os.path.join(attachments_dir, attachment.metadata.temporary_file_name)
+        with open(attachment_path, "rb") as attachment_file:
+            attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
+
+
 @app.post("/upload", response_model=ParsedDocument)
 async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:  # noqa
+    # Parses the uploaded file with the manager and wraps the result into a response chosen by the return_format parameter.
+    # Raises MissingFileError if there is no file or its name is empty.
     parameters = dataclasses.asdict(query_params)
     if not file or file.filename == "":
         raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)
 
+    return_format = str(parameters.get("return_format", "json")).lower()
+
     with tempfile.TemporaryDirectory() as tmpdir:
         file_path = save_upload_file(file, tmpdir)
-        document_tree = manager.parse(file_path, parameters=dict(parameters))
+        # "attachments_dir" is set last, so tmpdir overrides any value of this key present in the request parameters
+        document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
+
+        # attachments' files are read here, inside the with-block, because tmpdir is removed when the block exits
+        if return_format == "html":
+            __add_base64_info_to_attachments(document_tree, tmpdir)
 
-    return_format = str(parameters.get("return_format", "json")).lower()
     if return_format == "html":
         html_content = json2html(
             text="",
             paragraph=document_tree.content.structure,
             tables=document_tree.content.tables,
-            attachments=document_tree.attachments, tabs=0
+            attachments=document_tree.attachments,
+            tabs=0
         )
         return HTMLResponse(content=html_content)
-    elif return_format == "plain_text":
+
+    if return_format == "plain_text":
         txt_content = json2txt(paragraph=document_tree.content.structure)
         return PlainTextResponse(content=txt_content)
-    elif return_format == "tree":
+
+    if return_format == "tree":
         html_content = json2tree(paragraph=document_tree.content.structure)
         return HTMLResponse(content=html_content)
-    elif return_format == "ujson":
+
+    if return_format == "ujson":
         return UJSONResponse(content=document_tree.to_api_schema().model_dump())
-    elif return_format == "collapsed_tree":
+
+    if return_format == "collapsed_tree":
         html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
         return HTMLResponse(content=html_content)
-    elif return_format == "pretty_json":
+
+    if return_format == "pretty_json":
         return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
-    else:
-        logger.info(f"Send result. File {file.filename} with parameters {parameters}")
-        return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
+
+    # every return_format value not handled above (including the default "json") ends up here
+    logger.info(f"Send result. File {file.filename} with parameters {parameters}")
+    return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
 
 
 @app.get("/upload_example")

diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
@@ -70,7 +70,7 @@ <h4>Type of document structure parsing</h4>
 
         <div class="parameters">
             <h4>Attachments handling</h4>
-            <details><summary>with_attachments, need_content_analysis, recursion_deep_attachments, return_base64, attachments_dir</summary>
+            <details><summary>with_attachments, need_content_analysis, recursion_deep_attachments, return_base64</summary>
                 <br>
                 <p>
                     <label><input name="with_attachments" type="checkbox" value="true"> with_attachments </label>
@@ -87,10 +87,6 @@ <h4>Attachments handling</h4>
                 <p>
                     <label><input name="return_base64" type="checkbox" value="true"> return_base64 </label>
                 </p>
-
-                <p>
-                    <label>attachments_dir <input name="attachments_dir" type="text" size="35" value=""></label>
-                </p>
             </details>
         </div>
 

diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py
@@ -1,5 +1,5 @@
 import uuid
-from typing import Dict, Union
+from typing import Any, Dict, Union
 
 from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata
 from dedoc.data_structures.serializable import Serializable
@@ -38,8 +38,11 @@ def __init__(self,
         self.access_time = access_time
         self.file_type = file_type
         for key, value in kwargs.items():
-            setattr(self, key, value)
+            self.add_attribute(key, value)
         self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid
 
+    def add_attribute(self, key: str, value: Any) -> None:  # noqa
+        """
+        Sets an attribute with the given name on this metadata object.
+        The attribute becomes a part of vars(self), which to_api_schema passes to ApiDocumentMetadata.
+
+        :param key: name of the attribute
+        :param value: value of the attribute
+        """
+        setattr(self, key, value)
+
     def to_api_schema(self) -> ApiDocumentMetadata:
         return ApiDocumentMetadata(**vars(self))
diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py
@@ -1,14 +1,11 @@
 import hashlib
 import logging
-import os
 import re
-import zipfile
 from collections import defaultdict
-from typing import List, Optional
+from typing import List
 
 from bs4 import BeautifulSoup, Tag
 
-from dedoc.common.exceptions.bad_file_error import BadFileFormatError
 from dedoc.data_structures.attached_file import AttachedFile
 from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
 from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
@@ -19,6 +16,7 @@
 from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
+from dedoc.utils.office_utils import get_bs_from_zip
 from dedoc.utils.utils import calculate_file_hash
 
 
@@ -28,8 +26,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
         self.path = path
         self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
 
-        self.document_bs_tree = self.__get_bs_tree("word/document.xml")
-        self.document_bs_tree = self.__get_bs_tree("word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
+        self.document_bs_tree = get_bs_from_zip(self.path, "word/document.xml")
+        self.document_bs_tree = get_bs_from_zip(self.path, "word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
         self.body = self.document_bs_tree.body if self.document_bs_tree else None
         self.paragraph_maker = self.__get_paragraph_maker()
 
@@ -39,8 +37,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
         self.lines = self.__get_lines()
 
     def __get_paragraph_maker(self) -> ParagraphMaker:
-        styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), self.logger)
-        num_tree = self.__get_bs_tree("word/numbering.xml")
+        styles_extractor = StylesExtractor(get_bs_from_zip(self.path, "word/styles.xml"), self.logger)
+        num_tree = get_bs_from_zip(self.path, "word/numbering.xml")
         numbering_extractor = NumberingExtractor(num_tree, styles_extractor) if num_tree else None
         styles_extractor.numbering_extractor = numbering_extractor
 
@@ -49,8 +47,8 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
             path_hash=calculate_file_hash(path=self.path),
             styles_extractor=styles_extractor,
             numbering_extractor=numbering_extractor,
-            footnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")),
-            endnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote")
+            footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
+            endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
         )
 
     def __get_lines(self) -> List[LineWithMeta]:
@@ -120,23 +118,6 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d
 
         return lines_with_meta
 
-    def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]:
-        """
-        Gets xml bs tree from the given file inside the self.path.
-        :param filename: name of file to extract the tree
-        :return: BeautifulSoup tree or None if file wasn't found
-        """
-        try:
-            with zipfile.ZipFile(self.path) as document:
-                content = document.read(filename)
-                content = re.sub(br"\n[\t ]*", b"", content)
-                soup = BeautifulSoup(content, "xml")
-                return soup
-        except KeyError:
-            return None
-        except zipfile.BadZipFile:
-            raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken")
-
     def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
         table = DocxTable(xml, self.paragraph_maker)
         self.tables.append(table.to_table())
@@ -150,9 +131,9 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
             table_refs[len(self.paragraph_list) - 1].append(table_uid)
 
     def __handle_images_xml(self, xmls: List[Tag], image_refs: dict) -> None:
-        rels = self.__get_bs_tree("word/_rels/document.xml.rels")
+        rels = get_bs_from_zip(self.path, "word/_rels/document.xml.rels")
         if rels is None:
-            rels = self.__get_bs_tree("word/_rels/document2.xml.rels")
+            rels = get_bs_from_zip(self.path, "word/_rels/document2.xml.rels")
 
         images_rels = dict()
         for rel in rels.find_all("Relationship"):

diff --git a/dedoc/readers/pptx_reader/numbering_extractor.py b/dedoc/readers/pptx_reader/numbering_extractor.py
@@ -0,0 +1,51 @@
+class NumberingExtractor:
+    """
+    This class is used to compute numbering text for items of automatically numbered lists.
+    For example: "1. ", "(iv) ", "B) "
+    """
+    def __init__(self) -> None:
+        # Mapping according to the ST_TextAutonumberScheme
+        # NOTE we ignore chinese, japanese, hindi, thai
+        self.numbering_types = dict(
+            arabic="1",  # 1, 2, 3, ..., 10, 11, 12, ...
+            alphaLc="a",  # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ...
+            alphaUc="A",  # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ...
+            romanLc="i",  # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ...
+            romanUc="I"  # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ...
+        )
+
+        self.numbering_formatting = dict(
+            ParenBoth="({}) ",
+            ParenR="{}) ",
+            Period="{}. ",
+            Plain="{} "
+        )
+
+        # e.g. "arabicPeriod" -> ("arabic", "Period")
+        self.combined_types = {
+            num_type + num_formatting: (num_type, num_formatting) for num_type in self.numbering_types for num_formatting in self.numbering_formatting
+        }
+        # Pairs (value, letters) in descending order of value.
+        # Subtractive pairs (cm, cd, xc, xl, ix, iv) are needed to get "iv" instead of "iiii", "ix" instead of "viiii", etc.
+        self.roman_mapping = [
+            (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"), (100, "c"), (90, "xc"),
+            (50, "l"), (40, "xl"), (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")
+        ]
+
+    def get_text(self, numbering: str, shift: int) -> str:
+        """
+        Computes the next item of the list sequence.
+        :param numbering: type of the numbering, e.g. "arabicPeriod", unknown types are handled as "arabicPeriod"
+        :param shift: shift from the beginning of list numbering, 0 corresponds to the first item ("1", "a", "i")
+        :return: string representation of the next numbering item (with a trailing space)
+        """
+        num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period"))
+
+        if num_type in ("alphaLc", "alphaUc"):
+            # after "z" the letter is repeated: aa, bb, ..., zz, aaa, ...
+            letter_shift, repeat_count = shift % 26, shift // 26 + 1
+            num_char = chr(ord(self.numbering_types[num_type]) + letter_shift) * repeat_count
+        elif num_type in ("romanLc", "romanUc"):
+            # shift is zero-based while roman numerals start from one (there is no roman zero)
+            remainder = shift + 1
+            num_char = ""
+            for number, letters in self.roman_mapping:
+                cnt, remainder = divmod(remainder, number)
+                num_char += letters * cnt
+            if num_type == "romanUc":
+                num_char = num_char.upper()
+        else:
+            num_char = str(int(self.numbering_types["arabic"]) + shift)
+
+        return self.numbering_formatting[num_formatting].format(num_char)
diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py
@@ -0,0 +1,55 @@
+from bs4 import Tag
+
+from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \
+    StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation
+from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
+from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
+from dedoc.utils.annotation_merger import AnnotationMerger
+
+
+class PptxParagraph:
+    """
+    Wrapper around a single <a:p> tag, i.e. one textual paragraph of some entity such as a shape or a table cell.
+    """
+    def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
+        auto_numbering = xml.buAutoNum
+        paragraph_pr = xml.pPr
+
+        self.xml = xml
+        self.numbering_extractor = numbering_extractor
+        self.properties_extractor = properties_extractor
+        self.annotation_merger = AnnotationMerger()
+        # None means that the paragraph has no <buAutoNum> tag, i.e. it isn't an item of a numbered list
+        self.numbered_list_type = auto_numbering.get("type", "arabicPeriod") if auto_numbering else None
+        # a paragraph without <pPr> or without the "lvl" attribute gets level 1
+        self.level = int(paragraph_pr.get("lvl", 0)) + 1 if paragraph_pr else 1
+        self.dict2annotation = {
+            annotation.name: annotation
+            for annotation in (BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation)
+        }
+
+    def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
+        """
+        Builds the line of the paragraph: optional list prefix, text of the runs, trailing line break and annotations.
+
+        :param page_id: value of the page_id field of the line's metadata
+        :param line_id: value of the line_id field of the line's metadata
+        :param is_title: if True, the line gets the header hierarchy level
+        :param shift: value passed to the numbering extractor for items of numbered lists
+        :return: line with text, metadata and annotations
+        """
+        paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
+        hierarchy_level = HierarchyLevel.create_raw_text()
+        prefix = ""
+
+        is_header = is_title or paragraph_properties.title
+        if is_header:
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
+        elif self.numbered_list_type:  # numbered list
+            prefix = prefix + self.numbering_extractor.get_text(self.numbered_list_type, shift)
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=2, level_2=self.level, can_be_multiline=False)
+        elif self.xml.buChar:  # bullet list
+            bullet = self.xml.buChar["char"]
+            prefix = prefix + bullet + " "
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=3, level_2=self.level, can_be_multiline=False)
+
+        text = prefix
+        annotations = []
+        runs = self.xml.find_all("a:r") if self.xml.r else []
+        for run in runs:
+            start = len(text)
+            for child in run:
+                if child.name != "t" or not run.text:
+                    continue
+                text += run.text
+            end = len(text)
+
+            run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties)
+            annotations.append(SizeAnnotation(start=start, end=end, value=str(run_properties.size)))
+            annotations.extend(
+                annotation_class(start=start, end=end, value="True")
+                for property_name, annotation_class in self.dict2annotation.items()
+                if getattr(run_properties, property_name)
+            )
+
+        text += "\n"
+        merged_annotations = self.annotation_merger.merge_annotations(annotations, text)
+        merged_annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment))
+        metadata = LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level)
+        return LineWithMeta(text, metadata=metadata, annotations=merged_annotations)