Skip to content

Commit 953c812

Browse files
committed
TLDR-444 fixed after review
1 parent 714ad98 commit 953c812

File tree

2 files changed

+15
-14
lines changed

2 files changed

+15
-14
lines changed

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py

+5-4
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,8 @@
1717
from pdfminer.pdfpage import PDFPage
1818

1919
from dedoc.common.exceptions.bad_file_error import BadFileFormatError
20-
from dedoc.data_structures import BBoxAnnotation
2120
from dedoc.data_structures.annotation import Annotation
21+
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
2222
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
2323
from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
2424
from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
@@ -175,7 +175,6 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
175175
chars_with_style = []
176176
rand_weight = self._get_new_weight()
177177
prev_style = ""
178-
annotations: List[Annotation]
179178

180179
for lobj_char in lobj:
181180
if isinstance(lobj_char, LTChar) or isinstance(lobj_char, LTAnno):
@@ -207,7 +206,7 @@ def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: fl
207206

208207
return annotations
209208

210-
def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[BBoxAnnotation]:
209+
def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]:
211210
words: List[WordObj] = []
212211
word: WordObj = WordObj(start=0, end=0, value=LTTextContainer())
213212
if isinstance(lobj, LTTextLineHorizontal):
@@ -240,11 +239,13 @@ def __parse_style_string(self, chars_with_meta: str, begin: int, end: int) -> Li
240239
font, size, *_ = prev_style.split("_")
241240
fontname_wo_rand = font.split("+")[-1]
242241
styles = fontname_wo_rand.split("-")[-1]
242+
annotations.append(StyleAnnotation(begin, end, value=fontname_wo_rand))
243+
243244
if "Bold" in styles:
244245
annotations.append(BoldAnnotation(begin, end, value="True"))
245246
if "Italic" in styles:
246247
annotations.append(ItalicAnnotation(begin, end, value="True"))
247-
annotations.append(StyleAnnotation(begin, end, value=fontname_wo_rand))
248+
248249
if size.replace(".", "", 1).isnumeric():
249250
annotations.append(SizeAnnotation(begin, end, value=size))
250251

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py

+10-10
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,8 @@
77
from pdfminer.layout import LTContainer
88
from pdfminer.pdfpage import PDFPage
99

10-
from dedoc.data_structures import BBox, BBoxAnnotation
10+
from dedoc.data_structures.bbox import BBox
11+
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
1112

1213

1314
def draw_layout_element(image_src: np.ndarray,
@@ -39,18 +40,17 @@ def draw_annotation(image: np.ndarray, annotations: List[BBoxAnnotation]) -> Non
3940

4041

4142
def convert_coordinates_pdf_to_image(lobj: LTContainer, k_w: float, k_h: float, height_page: int) -> BBox:
42-
x0_new = int(lobj.x0 * k_w)
43-
x1_new = int(lobj.x1 * k_w)
44-
y0_new = int((height_page - lobj.y1) * k_h)
45-
y1_new = int((height_page - lobj.y0) * k_h)
43+
x0 = int(lobj.x0 * k_w)
44+
x1 = int(lobj.x1 * k_w)
45+
y0 = int((height_page - lobj.y1) * k_h)
46+
y1 = int((height_page - lobj.y0) * k_h)
4647

47-
return BBox(x0_new, y0_new, x1_new - x0_new, y1_new - y0_new)
48+
return BBox(x0, y0, x1 - x0, y1 - y0)
4849

4950

5051
def create_bbox(height: int, k_h: float, k_w: float, lobj: LTContainer) -> BBox:
5152
curr_box_line = convert_coordinates_pdf_to_image(lobj, k_w, k_h, height)
52-
bbox = BBox.from_two_points((curr_box_line.x_top_left, curr_box_line.y_top_left),
53-
(curr_box_line.x_bottom_right, curr_box_line.y_bottom_right))
53+
bbox = BBox.from_two_points((curr_box_line.x_top_left, curr_box_line.y_top_left), (curr_box_line.x_bottom_right, curr_box_line.y_bottom_right))
5454
return bbox
5555

5656

@@ -60,10 +60,10 @@ def cleaning_text_from_hieroglyphics(text_str: str) -> str:
6060
:param text_str: text
6161
:return: text wo cids-chars
6262
"""
63-
return re.sub(r"\(cid:(\d)*\)", cid_recognized, text_str)
63+
return re.sub(r"\(cid:(\d)*\)", cid_to_ascii_text, text_str)
6464

6565

66-
def cid_recognized(m: Match) -> str:
66+
def cid_to_ascii_text(m: Match) -> str:
6767
v = m.group(0)
6868
v = v.strip("(").strip(")")
6969
ascii_num = v.split(":")[-1]

0 commit comments

Comments
 (0)