Skip to content

Commit

Permalink
TLDR-420 word bbox annotation (#307)
Browse files Browse the repository at this point in the history
* add bbox annotations for words

* update tests

* fix lint

* fix confidence annotation and save relative bbox in BBoxAnnotation

* add docstrings for page_width and page_height
  • Loading branch information
dronperminov authored Aug 22, 2023
1 parent 0f811ed commit e9a877e
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 16 deletions.
10 changes: 10 additions & 0 deletions dedoc/data_structures/bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,16 @@ def to_dict(self) -> dict:
res["height"] = self.height
return res

def to_relative_dict(self, page_width: int, page_height: int) -> dict:
    """
    Convert the bounding box into a dictionary of page-relative coordinates.

    Horizontal values (x_top_left, width) are divided by page_width, vertical values
    (y_top_left, height) are divided by page_height. The page size itself is stored
    in the result alongside the relative values.

    :param page_width: width of the page the bounding box is measured against
    :param page_height: height of the page the bounding box is measured against
    :return: ordered dictionary with the keys x_top_left, y_top_left, width, height, page_width, page_height
    :raises ZeroDivisionError: propagated from the division when a page dimension is zero
    """
    # the pairs are listed in the order in which the keys must appear in the result
    return OrderedDict([
        ("x_top_left", self.x_top_left / page_width),
        ("y_top_left", self.y_top_left / page_height),
        ("width", self.width / page_width),
        ("height", self.height / page_height),
        ("page_width", page_width),
        ("page_height", page_height),
    ])

@staticmethod
def from_dict(some_dict: Dict[str, int]) -> "BBox":
    """
    Build a BBox by passing the dictionary items to the constructor as keyword arguments.

    :param some_dict: mapping whose keys are names of BBox constructor arguments
    :return: the constructed BBox
    """
    bbox = BBox(**some_dict)
    return bbox
15 changes: 8 additions & 7 deletions dedoc/data_structures/concrete_annotations/bbox_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,22 @@

class BBoxAnnotation(Annotation):
"""
Coordinates of the line's bounding box (in pixels) - for pdf documents.
Coordinates of the line's bounding box (in relative coordinates) - for pdf documents.
"""
name = "bounding box"

def __init__(self, start: int, end: int, value: BBox) -> None:
def __init__(self, start: int, end: int, value: BBox, page_width: int, page_height: int) -> None:
"""
:param start: start of the annotated text (usually zero)
:param end: end of the annotated text (usually end of the line)
:param value: bounding box where line is located
:param page_width: width of original image with this bbox
:param page_height: height of original image with this bbox
"""
try:
BBox(value.x_top_left, value.y_top_left, value.width, value.height)
except ValueError:
if not isinstance(value, BBox):
raise ValueError("the value of bounding box annotation should be instance of BBox")
super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_dict()))

super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height)))

@staticmethod
def get_api_dict(api: Api) -> Model:
Expand All @@ -31,5 +32,5 @@ def get_api_dict(api: Api) -> Model:
"end": fields.Integer(description="annotation end index", required=True, example=4),
"value": fields.String(description="bounding box of text chunk",
required=True,
example='{"x_top_left": 0, "y_top_left": 0, "width": 70, "height": 20}')
example='{"x_top_left": 0, "y_top_left": 0, "width": 0.5, "height": 0.2, "page_width": 1000, "page_height": 400}')
})
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ def __init__(self, start: int, end: int, value: str) -> None:
"""
:param start: start of the text
:param end: end of the text (not included)
:param value: confidence level in "percents" (float or integer number from 0 to 100)
:param value: confidence level in "percents" (float number from 0 to 1)
"""
try:
assert 0.0 <= float(value) <= 100.0
assert 0.0 <= float(value) <= 1.0
except ValueError:
raise ValueError("the value of confidence annotation should be float value")
except AssertionError:
raise ValueError("the value of confidence annotation should be in range [0, 100]")
raise ValueError("the value of confidence annotation should be in range [0, 1]")
super().__init__(start=start, end=end, name=ConfidenceAnnotation.name, value=value, is_mergeable=False)

@staticmethod
Expand Down
3 changes: 0 additions & 3 deletions dedoc/readers/pdf_reader/data_classes/text_with_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.serializable import Serializable


Expand All @@ -24,8 +23,6 @@ def __init__(self,
self.text = text
self.label = label
self.annotations = [] if annotations is None else annotations
if BBoxAnnotation.name not in [annotation.name for annotation in self.annotations]:
self.annotations.append(BBoxAnnotation(start=0, end=len(text), value=bbox))
self.uid = f"bbox_{uuid1()}" if uid is None else uid

def __str__(self) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str,
output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_thr)
else:
output_dict = get_text_with_bbox_from_cells(image, language, ocr_conf_threshold=0.0)
line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_confidence())

height, width = image.shape[:2]
line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height))
for line_num, line in enumerate(output_dict.lines)]

return line_boxes
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import List

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_word import OcrWord
Expand All @@ -20,17 +22,20 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None:
def text(self) -> str:
return " ".join(word.text for word in self.words if word.text != "") + "\n"

def get_confidence(self) -> List[ConfidenceAnnotation]:
def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]:
start = 0
annotations = []

for word in self.words:
if word.text == "":
continue

annotations.append(ConfidenceAnnotation(start, start + len(word.text), str(word.confidence)))
end = start + len(word.text)
annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100)))
annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height))
start += len(word.text) + 1

annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
return annotations

@staticmethod
Expand Down
2 changes: 2 additions & 0 deletions tests/api_tests/test_api_format_pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

from dedoc.data_structures import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
Expand All @@ -20,6 +21,7 @@ def __check_example_file(self, result: dict) -> None:
self.assertIn(BoldAnnotation.name, annotation_names)
self.assertIn(SpacingAnnotation.name, annotation_names)
self.assertIn(ConfidenceAnnotation.name, annotation_names)
self.assertIn(BBoxAnnotation.name, annotation_names)
self._check_similarity("1.2.1 Поясним за непонятное", content[3]["subparagraphs"][0]["text"])

def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None:
Expand Down

0 comments on commit e9a877e

Please sign in to comment.