Make train dataset API separated (#396)

* Make train dataset API separated
* Fix tests
* Add API tests and fix some bugs
* Fix resources path in train scripts
ispras · Jan 17, 2024 · 0b7ea01 · 0b7ea01
1 parent 9ef562b
commit 0b7ea01
Show file tree

Hide file tree

Showing 146 changed files with 5,404 additions and 845 deletions.
diff --git a/.flake8 b/.flake8
@@ -4,7 +4,7 @@ max-line-length = 160
 max-complexity = 13
 inline-quotes = "
 
-application-import-names = dedoc, tests
+application-import-names = dedoc, tests, scripts, train_dataset
 import-order-style = pycharm
 
 exclude =
@@ -14,8 +14,7 @@ exclude =
     .github,
     *__init__.py,
     resources,
-    dedoc/scripts,
-    examples,
+    scripts,
     venv,
     build,
     dedoc.egg-info

diff --git a/.github/workflows/test_labeling.yaml b/.github/workflows/test_labeling.yaml
@@ -0,0 +1,34 @@
+name: CI
+
+# Controls when the action will run. 
+on:
+  pull_request:
+    branches:
+      - develop
+      - master
+    paths-ignore:
+      - 'VERSION'
+      - 'docs/source/changelog.rst'
+  push:
+    branches:
+    - develop
+    - master
+    paths-ignore:
+      - 'VERSION'
+      - 'docs/source/changelog.rst'
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  labeling:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout repo
+      uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.9'
+    - name: Run tests for labeling
+      run: |
+        test="true" docker-compose -f labeling/docker-compose.yml up --build --exit-code-from test
diff --git a/.gitignore b/.gitignore
@@ -96,7 +96,6 @@ ENV/
 [Ll]ib
 [Ll]ib64
 [Ll]ocal
-[Ss]cripts
 pyvenv.cfg
 .venv
 pip-selfcheck.json

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 5.0.4
     hooks:
     -   id: flake8
-        exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info
+        exclude: \.github|.*__init__\.py|resources|scripts|examples|docs|venv|build|dedoc\.egg-info
         args:
             - "--config=.flake8"
         additional_dependencies: [

diff --git a/Dockerfile b/Dockerfile
@@ -20,4 +20,4 @@ RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/vers
 ADD tests /dedoc_root/tests
 ADD resources /dedoc_root/resources
 
-CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
+CMD ["python3", "/dedoc_root/dedoc/main.py"]
diff --git a/dedoc/config.py b/dedoc/config.py
@@ -1,8 +1,6 @@
-import importlib.util
 import logging
 import os
 import sys
-from typing import Any, Optional
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")
 
@@ -13,6 +11,7 @@
     # -----------------------------------------RESOURCES PATH SETTINGS----------------------------------------------------
     resources_path=RESOURCES_PATH,
     intermediate_data_path=os.path.join(RESOURCES_PATH, "datasets"),
+    table_path="/tmp/tables",
 
     # -----------------------------------------COMMON DEBUG SETTINGS----------------------------------------------------
     debug_mode=DEBUG_MODE,
@@ -66,20 +65,11 @@ def get_instance(cls: "Configuration") -> "Configuration":
 
         return cls.__instance
 
-    def __init_config(self, args: Optional[Any] = None) -> None:
-        if args is not None and args.config_path is not None:
-            spec = importlib.util.spec_from_file_location("config_module", args.config_path)
-            config_module = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(config_module)
-            self.__config = config_module._config
-        else:
+    def get_config(self) -> dict:
+        if self.__config is None:
             self.__config = _config
-
-    def get_config(self, args: Optional[Any] = None) -> dict:
-        if self.__config is None or args is not None:
-            self.__init_config(args)
         return self.__config
 
 
-def get_config(args: Optional[Any] = None) -> dict:
-    return Configuration.get_instance().get_config(args)
+def get_config() -> dict:
+    return Configuration.get_instance().get_config()
diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py
@@ -10,7 +10,7 @@
 from dedoc.data_structures import ParsedDocument, UnstructuredDocument
 from dedoc.manager_config import get_manager_config
 from dedoc.metadata_extractors import BaseMetadataExtractor
-from dedoc.train_dataset.train_dataset_utils import get_path_original_documents, save_line_with_meta
+from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta
 from dedoc.utils.utils import get_unique_name
 
 
@@ -114,7 +114,7 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
             self.logger.info(f"Extract structure from file {file_name}")
 
             if self.config.get("labeling_mode", False):
-                self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document)
+                self.__save(converted_file_path, unstructured_document)
 
             # Step 5 - Form the output structure
             parsed_document = self.structure_constructor.construct(document=unstructured_document, parameters=parameters)
@@ -141,5 +141,6 @@ def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict:
         return result_parameters
 
     def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None:
+        self.logger.info(f'Save document lines to {self.config["intermediate_data_path"]}')
         save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path))
         shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path)))
diff --git a/dedoc/main.py b/dedoc/main.py
@@ -1,26 +1,7 @@
-import argparse
-
 from dedoc.api.dedoc_api import get_api, run_api  # noqa
-from dedoc.config import Configuration, get_config
-
-
-def main() -> None:
-    run_api(get_api())
+from dedoc.config import Configuration
 
 
 if __name__ == "__main__":
-    parser_config = argparse.ArgumentParser()
-    parser_config.add_argument("-c", "--config_path", help="path to configuration file")
-    parser_config.add_argument("-m", "--module", help="Only for tests")
-    parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs="*", help="Only for tests")
-    parser_config.add_argument("-v", "--unitest_verbose_mode", nargs="?", help="to enable verbose mode of unittest. Only for tests")
-
-    args_config = parser_config.parse_args()
-    Configuration.get_instance().get_config(args_config)
-    config = get_config()
-
-    if config.get("labeling_mode", False):
-        from api.train_dataset.train_dataset_api import run_special_api  # noqa
-        run_special_api()
-    else:
-        main()
+    Configuration.get_instance().get_config()
+    run_api(get_api())
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
@@ -24,6 +24,9 @@ def text(self) -> str:
         return " ".join(word.text for word in self.words if word.text != "") + "\n"
 
     def get_annotations(self, page_width: int, page_height: int, extract_line_bbox: bool) -> List[Annotation]:
+        if extract_line_bbox:
+            return [BBoxAnnotation(0, len(" ".join([w.text for w in self.words])), self.bbox, page_width, page_height)]
+
         start = 0
         annotations = []
 
@@ -35,8 +38,7 @@ def get_annotations(self, page_width: int, page_height: int, extract_line_bbox:
             annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100)))
             annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height))
             start += len(word.text) + 1
-        if extract_line_bbox:
-            annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
+
         return annotations
 
     @staticmethod

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -14,7 +14,6 @@
 from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
 from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
-from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
 from dedoc.utils import supported_image_types
 from dedoc.utils.parameter_utils import get_path_param
 from dedoc.utils.utils import get_mime_extension
@@ -94,9 +93,6 @@ def _process_one_page(self,
         page = self.ocr.split_image2lines(image=clean_image, language=parameters.language, is_one_column_document=is_one_column_document, page_num=page_number)
 
         lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page)
-        if self.config.get("labeling_mode"):
-            save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))
-
         return lines, tables, page.attachments, [angle]
 
     def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, float]:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import time
 from typing import List, Optional, Tuple
 
 import cv2
@@ -13,7 +14,6 @@
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor
-from dedoc.train_dataset.data_path_config import table_path as save_path
 
 """-------------------------------------entry class of Table Recognizer Module---------------------------------------"""
 
@@ -73,7 +73,7 @@ def __rec_tables_from_img(self,
             orient_cell_angle=orient_cell_angle,
             table_type=table_type)
         if self.config.get("labeling_mode", False):
-            self.__save_tables(tables=single_page_tables, image=src_image, table_path=save_path)
+            self.__save_tables(tables=single_page_tables, image=src_image, table_path=self.config.get("table_path", "/tmp/tables"))
         if self.table_type.detect_one_cell_table in table_type:
             filtered_tables = single_page_tables
         else:
@@ -142,9 +142,9 @@ def __save_tables(self, tables: List[ScanTable], image: np.ndarray, table_path:
         image = Image.fromarray(image)
         os.makedirs(table_path, exist_ok=True)
         for table in tables:
-            images_cnt = len(os.listdir(table_path))
-            image_path = os.path.join(table_path, f"{images_cnt:06d}.png")
-            jsons_path = os.path.join(table_path, f"{images_cnt:06d}.json")
+            file_name = str(int(time.time()))
+            image_path = os.path.join(table_path, f"{file_name}.png")
+            jsons_path = os.path.join(table_path, f"{file_name}.json")
             image.save(image_path)
             with open(jsons_path, "w") as out:
                 json.dump(obj=table.to_dict(), fp=out, indent=4, ensure_ascii=False)
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -207,6 +207,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
         lines = []
         page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
         prev_line = None
+        labeling_mode = self.config.get("labeling_mode", False)
 
         for block in page["blocks"]:
             annotations = []
@@ -219,9 +220,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
             for annotation in block["annotations"]:
                 start = annotation["start"]
                 end = annotation["end"]
-                box = BBox(x_top_left=int(annotation["x_top_left"]), y_top_left=int(annotation["y_top_left"]),
-                           width=int(annotation["width"]), height=int(annotation["height"]))
-                annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
+
+                if not labeling_mode:
+                    box = BBox(x_top_left=int(annotation["x_top_left"]), y_top_left=int(annotation["y_top_left"]),
+                               width=int(annotation["width"]), height=int(annotation["height"]))
+                    annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
+
                 annotations.append(SizeAnnotation(start, end, str(annotation["font_size"])))
                 annotations.append(StyleAnnotation(start, end, annotation["font_name"]))
 
@@ -235,7 +239,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
                     annotations.append(LinkedTextAnnotation(start, end, annotation["url"]))
 
             bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"]))
-            if self.config.get("labeling_mode", False):
+            if labeling_mode:
                 annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))
 
             meta = block["metadata"].lower()

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -1,4 +1,3 @@
-import os
 from typing import List, Optional, Tuple
 
 import numpy as np
@@ -10,7 +9,6 @@
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
 from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
-from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
 from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
 from dedoc.utils.utils import get_mime_extension
 
@@ -66,9 +64,6 @@ def _process_one_page(self,
         lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
         self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
 
-        if self.config.get("labeling_mode"):
-            save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))
-
         return lines, tables, page.attachments, []
 
     def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -37,7 +37,7 @@
 
 class PdfminerExtractor(object):
     """
-    Class extarcts text with style from pdf with help pdfminer.six
+    Class extracts text with style from pdf with help pdfminer.six
     """
 
     def __init__(self, *, config: dict) -> None:

diff --git a/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py b/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py
@@ -7,7 +7,6 @@
 
 from xgboost import XGBClassifier
 
-
 from dedoc.download_models import download_from_hub
 from dedoc.structure_extractors.line_type_classifiers.abstract_line_type_classifier import AbstractLineTypeClassifier
 from dedoc.utils.parameter_utils import get_param_gpu_available

diff --git a/dedoc/train_dataset/data_path_config.py b/dedoc/train_dataset/data_path_config.py