Skip to content

Commit

Permalink
TLDR-444 added word bounding boxes to the pdfminer reader (#313)
Browse files Browse the repository at this point in the history
* TLDR-444 added word support to pdfminer-reader

* TLDR-444 added word extraction from pdfminer; pdfminer refactoring

* TLDR-444 added tests (word bounding box)

* TLDR-444 fixed code style

* TLDR-444 fixed after review
  • Loading branch information
oksidgy authored and sunveil committed Aug 29, 2023
1 parent d4c3d34 commit 436c7f8
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 162 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox


Expand All @@ -25,7 +25,7 @@ def __init__(self, *, config: dict) -> None:
:param config: configuration of the reader, e.g. logger for logging
"""
super().__init__(config=config)
self.extractor_layer = ExtractorPdfTextLayer(config=config)
self.extractor_layer = PdfminerExtractor(config=config)

def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
"""
Expand Down
Empty file.
Loading

0 comments on commit 436c7f8

Please sign in to comment.