Skip to content

Commit

Permalink
Make train dataset API separated (#396)
Browse files Browse the repository at this point in the history
* Make train dataset API separated

* Fix tests

* Add API tests and fix some bugs

* Fix resources path in train scripts
  • Loading branch information
NastyBoget authored Jan 17, 2024
1 parent 9ef562b commit 0b7ea01
Show file tree
Hide file tree
Showing 146 changed files with 5,404 additions and 845 deletions.
5 changes: 2 additions & 3 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ max-line-length = 160
max-complexity = 13
inline-quotes = "
application-import-names = dedoc, tests
application-import-names = dedoc, tests, scripts, train_dataset
import-order-style = pycharm
exclude =
Expand All @@ -14,8 +14,7 @@ exclude =
.github,
*__init__.py,
resources,
dedoc/scripts,
examples,
scripts,
venv,
build,
dedoc.egg-info
Expand Down
34 changes: 34 additions & 0 deletions .github/workflows/test_labeling.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
name: CI

# Controls when the action will run.
on:
pull_request:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
push:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

jobs:
labeling:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: '3.9'
- name: Run tests for labeling
run: |
test="true" docker-compose -f labeling/docker-compose.yml up --build --exit-code-from test
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ ENV/
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info
exclude: \.github|.*__init__\.py|resources|scripts|examples|docs|venv|build|dedoc\.egg-info
args:
- "--config=.flake8"
additional_dependencies: [
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/vers
ADD tests /dedoc_root/tests
ADD resources /dedoc_root/resources

CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
CMD ["python3", "/dedoc_root/dedoc/main.py"]
20 changes: 5 additions & 15 deletions dedoc/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import importlib.util
import logging
import os
import sys
from typing import Any, Optional

logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")

Expand All @@ -13,6 +11,7 @@
# -----------------------------------------RESOURCES PATH SETTINGS----------------------------------------------------
resources_path=RESOURCES_PATH,
intermediate_data_path=os.path.join(RESOURCES_PATH, "datasets"),
table_path="/tmp/tables",

# -----------------------------------------COMMON DEBUG SETTINGS----------------------------------------------------
debug_mode=DEBUG_MODE,
Expand Down Expand Up @@ -66,20 +65,11 @@ def get_instance(cls: "Configuration") -> "Configuration":

return cls.__instance

def __init_config(self, args: Optional[Any] = None) -> None:
if args is not None and args.config_path is not None:
spec = importlib.util.spec_from_file_location("config_module", args.config_path)
config_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config_module)
self.__config = config_module._config
else:
def get_config(self) -> dict:
if self.__config is None:
self.__config = _config

def get_config(self, args: Optional[Any] = None) -> dict:
if self.__config is None or args is not None:
self.__init_config(args)
return self.__config


def get_config(args: Optional[Any] = None) -> dict:
return Configuration.get_instance().get_config(args)
def get_config() -> dict:
return Configuration.get_instance().get_config()
5 changes: 3 additions & 2 deletions dedoc/dedoc_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from dedoc.data_structures import ParsedDocument, UnstructuredDocument
from dedoc.manager_config import get_manager_config
from dedoc.metadata_extractors import BaseMetadataExtractor
from dedoc.train_dataset.train_dataset_utils import get_path_original_documents, save_line_with_meta
from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta
from dedoc.utils.utils import get_unique_name


Expand Down Expand Up @@ -114,7 +114,7 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
self.logger.info(f"Extract structure from file {file_name}")

if self.config.get("labeling_mode", False):
self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document)
self.__save(converted_file_path, unstructured_document)

# Step 5 - Form the output structure
parsed_document = self.structure_constructor.construct(document=unstructured_document, parameters=parameters)
Expand All @@ -141,5 +141,6 @@ def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict:
return result_parameters

def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None:
self.logger.info(f'Save document lines to {self.config["intermediate_data_path"]}')
save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path))
shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path)))
25 changes: 3 additions & 22 deletions dedoc/main.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,7 @@
import argparse

from dedoc.api.dedoc_api import get_api, run_api # noqa
from dedoc.config import Configuration, get_config


def main() -> None:
run_api(get_api())
from dedoc.config import Configuration


if __name__ == "__main__":
parser_config = argparse.ArgumentParser()
parser_config.add_argument("-c", "--config_path", help="path to configuration file")
parser_config.add_argument("-m", "--module", help="Only for tests")
parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs="*", help="Only for tests")
parser_config.add_argument("-v", "--unitest_verbose_mode", nargs="?", help="to enable verbose mode of unittest. Only for tests")

args_config = parser_config.parse_args()
Configuration.get_instance().get_config(args_config)
config = get_config()

if config.get("labeling_mode", False):
from api.train_dataset.train_dataset_api import run_special_api # noqa
run_special_api()
else:
main()
Configuration.get_instance().get_config()
run_api(get_api())
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ def text(self) -> str:
return " ".join(word.text for word in self.words if word.text != "") + "\n"

def get_annotations(self, page_width: int, page_height: int, extract_line_bbox: bool) -> List[Annotation]:
if extract_line_bbox:
return [BBoxAnnotation(0, len(" ".join([w.text for w in self.words])), self.bbox, page_width, page_height)]

start = 0
annotations = []

Expand All @@ -35,8 +38,7 @@ def get_annotations(self, page_width: int, page_height: int, extract_line_bbox:
annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100)))
annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height))
start += len(word.text) + 1
if extract_line_bbox:
annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))

return annotations

@staticmethod
Expand Down
4 changes: 0 additions & 4 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
from dedoc.utils import supported_image_types
from dedoc.utils.parameter_utils import get_path_param
from dedoc.utils.utils import get_mime_extension
Expand Down Expand Up @@ -94,9 +93,6 @@ def _process_one_page(self,
page = self.ocr.split_image2lines(image=clean_image, language=parameters.language, is_one_column_document=is_one_column_document, page_num=page_number)

lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page)
if self.config.get("labeling_mode"):
save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))

return lines, tables, page.attachments, [angle]

def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool, float]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import logging
import os
import time
from typing import List, Optional, Tuple

import cv2
Expand All @@ -13,7 +14,6 @@
from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor
from dedoc.train_dataset.data_path_config import table_path as save_path

"""-------------------------------------entry class of Table Recognizer Module---------------------------------------"""

Expand Down Expand Up @@ -73,7 +73,7 @@ def __rec_tables_from_img(self,
orient_cell_angle=orient_cell_angle,
table_type=table_type)
if self.config.get("labeling_mode", False):
self.__save_tables(tables=single_page_tables, image=src_image, table_path=save_path)
self.__save_tables(tables=single_page_tables, image=src_image, table_path=self.config.get("table_path", "/tmp/tables"))
if self.table_type.detect_one_cell_table in table_type:
filtered_tables = single_page_tables
else:
Expand Down Expand Up @@ -142,9 +142,9 @@ def __save_tables(self, tables: List[ScanTable], image: np.ndarray, table_path:
image = Image.fromarray(image)
os.makedirs(table_path, exist_ok=True)
for table in tables:
images_cnt = len(os.listdir(table_path))
image_path = os.path.join(table_path, f"{images_cnt:06d}.png")
jsons_path = os.path.join(table_path, f"{images_cnt:06d}.json")
file_name = str(int(time.time()))
image_path = os.path.join(table_path, f"{file_name}.png")
jsons_path = os.path.join(table_path, f"{file_name}.json")
image.save(image_path)
with open(jsons_path, "w") as out:
json.dump(obj=table.to_dict(), fp=out, indent=4, ensure_ascii=False)
12 changes: 8 additions & 4 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
lines = []
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
prev_line = None
labeling_mode = self.config.get("labeling_mode", False)

for block in page["blocks"]:
annotations = []
Expand All @@ -219,9 +220,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
for annotation in block["annotations"]:
start = annotation["start"]
end = annotation["end"]
box = BBox(x_top_left=int(annotation["x_top_left"]), y_top_left=int(annotation["y_top_left"]),
width=int(annotation["width"]), height=int(annotation["height"]))
annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))

if not labeling_mode:
box = BBox(x_top_left=int(annotation["x_top_left"]), y_top_left=int(annotation["y_top_left"]),
width=int(annotation["width"]), height=int(annotation["height"]))
annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))

annotations.append(SizeAnnotation(start, end, str(annotation["font_size"])))
annotations.append(StyleAnnotation(start, end, annotation["font_name"]))

Expand All @@ -235,7 +239,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
annotations.append(LinkedTextAnnotation(start, end, annotation["url"]))

bbox = BBox(x_top_left=int(block["x_top_left"]), y_top_left=int(block["y_top_left"]), width=int(block["width"]), height=int(block["height"]))
if self.config.get("labeling_mode", False):
if labeling_mode:
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))

meta = block["metadata"].lower()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
from typing import List, Optional, Tuple

import numpy as np
Expand All @@ -10,7 +9,6 @@
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
from dedoc.utils.utils import get_mime_extension

Expand Down Expand Up @@ -66,9 +64,6 @@ def _process_one_page(self,
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)

if self.config.get("labeling_mode"):
save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))

return lines, tables, page.attachments, []

def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

class PdfminerExtractor(object):
"""
Class extarcts text with style from pdf with help pdfminer.six
Class extracts text with style from pdf with help pdfminer.six
"""

def __init__(self, *, config: dict) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from xgboost import XGBClassifier


from dedoc.download_models import download_from_hub
from dedoc.structure_extractors.line_type_classifiers.abstract_line_type_classifier import AbstractLineTypeClassifier
from dedoc.utils.parameter_utils import get_param_gpu_available
Expand Down
1 change: 0 additions & 1 deletion dedoc/train_dataset/data_path_config.py

This file was deleted.

Loading

0 comments on commit 0b7ea01

Please sign in to comment.