diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py index 3d10cf89..3de15ef2 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py @@ -64,7 +64,7 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str, output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_thr) else: output_dict = get_text_with_bbox_from_cells(image, language, ocr_conf_threshold=0.0) - line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_confidence()) + line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations()) for line_num, line in enumerate(output_dict.lines)] return line_boxes diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py index 64181a50..dbcfa8db 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py @@ -2,6 +2,8 @@ from dedoc.data_structures.bbox import BBox from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation +from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation +from dedoc.data_structures.annotation import Annotation from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_word import OcrWord @@ -20,7 +22,7 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None: def text(self) -> str: return " ".join(word.text for word in self.words if word.text != "") + "\n" - def get_confidence(self) -> List[ConfidenceAnnotation]: + def get_annotations(self) -> List[Annotation]: start = 0 annotations = [] @@ -28,9 +30,12 @@ def get_confidence(self) -> List[ConfidenceAnnotation]: if word.text == "": continue - annotations.append(ConfidenceAnnotation(start, start + len(word.text), str(word.confidence))) + end = start + len(word.text) + annotations.append(ConfidenceAnnotation(start, end, str(word.confidence))) + annotations.append(BBoxAnnotation(start, end, word.bbox)) start += len(word.text) + 1 + annotations.append(BBoxAnnotation(0, start, self.bbox)) return annotations @staticmethod