diff --git a/dedoc/data_structures/bbox.py b/dedoc/data_structures/bbox.py index dfdf34b0..05759c92 100644 --- a/dedoc/data_structures/bbox.py +++ b/dedoc/data_structures/bbox.py @@ -102,6 +102,16 @@ def to_dict(self) -> dict: res["height"] = self.height return res + def to_relative_dict(self, page_width: int, page_height: int) -> dict: + res = OrderedDict() + res["x_top_left"] = self.x_top_left / page_width + res["y_top_left"] = self.y_top_left / page_height + res["width"] = self.width / page_width + res["height"] = self.height / page_height + res["page_width"] = page_width + res["page_height"] = page_height + return res + @staticmethod def from_dict(some_dict: Dict[str, int]) -> "BBox": return BBox(**some_dict) diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py index fdeb145c..2bc0dac3 100644 --- a/dedoc/data_structures/concrete_annotations/bbox_annotation.py +++ b/dedoc/data_structures/concrete_annotations/bbox_annotation.py @@ -8,21 +8,22 @@ class BBoxAnnotation(Annotation): """ - Coordinates of the line's bounding box (in pixels) - for pdf documents. + Coordinates of the line's bounding box (in relative coordinates) - for pdf documents. 
""" name = "bounding box" - def __init__(self, start: int, end: int, value: BBox) -> None: + def __init__(self, start: int, end: int, value: BBox, page_width: int, page_height: int) -> None: """ :param start: start of the annotated text (usually zero) :param end: end of the annotated text (usually end of the line) :param value: bounding box where line is located + :param page_width: width of original image with this bbox + :param page_height: height of original image with this bbox """ - try: - BBox(value.x_top_left, value.y_top_left, value.width, value.height) - except ValueError: + if not isinstance(value, BBox): raise ValueError("the value of bounding box annotation should be instance of BBox") - super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_dict())) + + super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height))) @staticmethod def get_api_dict(api: Api) -> Model: @@ -31,5 +32,5 @@ def get_api_dict(api: Api) -> Model: "end": fields.Integer(description="annotation end index", required=True, example=4), "value": fields.String(description="bounding box of text chunk", required=True, - example='{"x_top_left": 0, "y_top_left": 0, "width": 70, "height": 20}') + example='{"x_top_left": 0, "y_top_left": 0, "width": 0.5, "height": 0.2, "page_width": 1000, "page_height": 400}') }) diff --git a/dedoc/data_structures/concrete_annotations/confidence_annotation.py b/dedoc/data_structures/concrete_annotations/confidence_annotation.py index d7977935..b7b7ad65 100644 --- a/dedoc/data_structures/concrete_annotations/confidence_annotation.py +++ b/dedoc/data_structures/concrete_annotations/confidence_annotation.py @@ -13,14 +13,14 @@ def __init__(self, start: int, end: int, value: str) -> None: """ :param start: start of the text :param end: end of the text (not included) - :param value: confidence level in "percents" (float or integer number from 0 to 100) + :param 
value: confidence level (float number from 0 to 1) """ try: - assert 0.0 <= float(value) <= 100.0 + assert 0.0 <= float(value) <= 1.0 except ValueError: raise ValueError("the value of confidence annotation should be float value") except AssertionError: - raise ValueError("the value of confidence annotation should be in range [0, 100]") + raise ValueError("the value of confidence annotation should be in range [0, 1]") super().__init__(start=start, end=end, name=ConfidenceAnnotation.name, value=value, is_mergeable=False) @staticmethod diff --git a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py index 30cb50d2..a6c7ed40 100644 --- a/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py +++ b/dedoc/readers/pdf_reader/data_classes/text_with_bbox.py @@ -4,7 +4,6 @@ from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.bbox import BBox -from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.serializable import Serializable @@ -24,8 +23,6 @@ def __init__(self, self.text = text self.label = label self.annotations = [] if annotations is None else annotations - if BBoxAnnotation.name not in [annotation.name for annotation in self.annotations]: - self.annotations.append(BBoxAnnotation(start=0, end=len(text), value=bbox)) self.uid = f"bbox_{uuid1()}" if uid is None else uid def __str__(self) -> str: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py index 3d10cf89..b7e1d520 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py @@ -64,7 +64,9 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str, output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_thr) else: 
output_dict = get_text_with_bbox_from_cells(image, language, ocr_conf_threshold=0.0) - line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_confidence()) + + height, width = image.shape[:2] + line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height)) for line_num, line in enumerate(output_dict.lines)] return line_boxes diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py index 64181a50..9b78f7d0 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py @@ -1,6 +1,8 @@ from typing import List +from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.bbox import BBox +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_word import OcrWord @@ -20,7 +22,7 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None: def text(self) -> str: return " ".join(word.text for word in self.words if word.text != "") + "\n" - def get_confidence(self) -> List[ConfidenceAnnotation]: + def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]: start = 0 annotations = [] @@ -28,9 +30,12 @@ def get_confidence(self) -> List[ConfidenceAnnotation]: if word.text == "": continue - annotations.append(ConfidenceAnnotation(start, start + len(word.text), str(word.confidence))) + end = start + len(word.text) + annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100))) + 
annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height)) start += len(word.text) + 1 + annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height)) return annotations @staticmethod diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py index 5f766472..afd29131 100644 --- a/tests/api_tests/test_api_format_pdf.py +++ b/tests/api_tests/test_api_format_pdf.py @@ -1,5 +1,6 @@ import os +from dedoc.data_structures import BBoxAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation @@ -20,6 +21,7 @@ def __check_example_file(self, result: dict) -> None: self.assertIn(BoldAnnotation.name, annotation_names) self.assertIn(SpacingAnnotation.name, annotation_names) self.assertIn(ConfidenceAnnotation.name, annotation_names) + self.assertIn(BBoxAnnotation.name, annotation_names) self._check_similarity("1.2.1 Поясним за непонятное", content[3]["subparagraphs"][0]["text"]) def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None: