diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index 8b635ad7..64d96fe6 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -3,6 +3,7 @@ from numpy import ndarray +from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable @@ -53,6 +54,9 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.binarizer = AdaptiveBinarizer() self.ocr = OCRLineExtractor(config=self.config) + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + return super().read(file_path, parameters) + def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index d277815b..4cebbaf4 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -3,6 +3,7 @@ from dedocutils.data_structures import BBox from numpy import ndarray +from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable @@ -37,6 +38,9 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer return 
super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) == "true" + def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: + return super().read(file_path, parameters) + def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, diff --git a/docs/source/_static/gost_frame_data/document_with_gost_frame.pdf b/docs/source/_static/gost_frame_data/document_with_gost_frame.pdf new file mode 100644 index 00000000..295df746 Binary files /dev/null and b/docs/source/_static/gost_frame_data/document_with_gost_frame.pdf differ diff --git a/docs/source/_static/page_with_gost_frame_1.png b/docs/source/_static/gost_frame_data/page_with_gost_frame_1.png similarity index 100% rename from docs/source/_static/page_with_gost_frame_1.png rename to docs/source/_static/gost_frame_data/page_with_gost_frame_1.png diff --git a/docs/source/_static/page_with_gost_frame_2.png b/docs/source/_static/gost_frame_data/page_with_gost_frame_2.png similarity index 100% rename from docs/source/_static/page_with_gost_frame_2.png rename to docs/source/_static/gost_frame_data/page_with_gost_frame_2.png diff --git a/docs/source/_static/result_gost_frame.png b/docs/source/_static/gost_frame_data/result_gost_frame.png similarity index 100% rename from docs/source/_static/result_gost_frame.png rename to docs/source/_static/gost_frame_data/result_gost_frame.png diff --git a/docs/source/parameters/gost_frame_handling.rst b/docs/source/parameters/gost_frame_handling.rst index 1d48cf63..7093b799 100644 --- a/docs/source/parameters/gost_frame_handling.rst +++ b/docs/source/parameters/gost_frame_handling.rst @@ -18,16 +18,16 @@ GOST frame handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * method :meth:`~dedoc.readers.BaseReader.read` of inheritors of :class:`~dedoc.readers.BaseReader` - * :meth:`dedoc.readers.PdfTabbyReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, 
:meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to enable GOST (Russian government standard "ГОСТ Р 21.1101") frame recognition for PDF documents or images. The content of each page of some technical documents is placed in special GOST frames. An example of GOST frames is shown in the example below (:ref:`example_gost_frame`). -Such frames contain meta-information and are not part of the text content of the document.Based on this, we have implemented the functionality for ignoring GOST frames in documents, which works for: +Such frames contain meta-information and are not part of the text content of the document. Based on this, we have implemented the functionality for ignoring GOST frames in documents, which works for: - * Copyable and non-copyable PDF documents (:class:`dedoc.readers.PdfTxtlayerReader` and :class:`dedoc.readers.PdfTabbyReader`); - * Images (:class:`dedoc.readers.PdfImageReader`). + * Copyable PDF documents (:class:`dedoc.readers.PdfTxtlayerReader` and :class:`dedoc.readers.PdfTabbyReader`); + * Non-copyable PDF documents and images (:class:`dedoc.readers.PdfImageReader`). If parameter ``need_gost_frame_analysis=True``, the GOST frame itself is ignored and only the contents inside the frame are extracted. @@ -35,11 +35,11 @@ If parameter ``need_gost_frame_analysis=True``, the GOST frame itself is ignored Examples of GOST frame ---------------------- -For example your send PDF-document with two pages: +For example, you send a PDF document with two pages (:download:`PDF-document with two pages <../_static/gost_frame_data/document_with_gost_frame.pdf>`): -.. image:: ../_static/page_with_gost_frame_1.png +.. image:: ../_static/gost_frame_data/page_with_gost_frame_1.png :width: 30% -.. image:: ../_static/page_with_gost_frame_2.png +.. 
image:: ../_static/gost_frame_data/page_with_gost_frame_2.png :width: 30% Parameter's usage @@ -62,7 +62,5 @@ Parameter's usage Request's result ---------------- -.. image:: ../_static/result_gost_frame.png +.. image:: ../_static/gost_frame_data/result_gost_frame.png :width: 50% - - diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 05c1db04..853c9b97 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -62,7 +62,7 @@ PDF and images handling - rus, eng, rus+eng, fra, spa - rus+eng - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` * :meth:`dedoc.structure_extractors.FintocStructureExtractor.extract` - Language of the document without a textual layer. The following values are available: @@ -77,7 +77,7 @@ PDF and images handling - :, start:, :end, start:end - : - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - If you need to read a part of the PDF document, you can use page slice to define the reading range. 
If the range is set like ``start_page:end_page``, document will be processed from ``start_page`` to ``end_page`` @@ -96,7 +96,7 @@ PDF and images handling - true, false, auto - auto - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to set the number of columns if the PDF document is without a textual layer in case it's known beforehand. The following values are available: @@ -111,7 +111,7 @@ PDF and images handling - auto, no_change - auto - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to control document orientation analysis for PDF documents without a textual layer. The following values are available: @@ -125,7 +125,7 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to **remove** headers and footers of PDF documents from the output result. If ``need_header_footer_analysis=False``, header and footer lines will present in the output as well as all other document lines. 
@@ -134,7 +134,7 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to clean background (binarize) for pages of PDF documents without a textual layer. If the document's background is heterogeneous, this option may help to improve the result of document text recognition. @@ -144,7 +144,7 @@ PDF and images handling - True, False - True - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to enable table recognition for PDF documents or images. The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`. @@ -155,18 +155,17 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images. - The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and - ignore the GOST frame on the document. 
It allows :class:`dedoc.readers.PdfImageReader`, :class:`dedoc.readers.PdfTxtlayerReader` - and :class:`dedoc.readers.PdfTabbyReader` to properly process the content of the document containing GOST frame, see :ref:`gost_frame_handling` for more details + It allows :class:`dedoc.readers.PdfImageReader`, :class:`dedoc.readers.PdfTxtlayerReader` and :class:`dedoc.readers.PdfTabbyReader` + to properly process the content of the document containing GOST frame, see :ref:`gost_frame_handling` for more details. * - orient_analysis_cells - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used for a table recognition for PDF documents or images. It is ignored when ``need_pdf_table_analysis=False``. @@ -177,7 +176,7 @@ PDF and images handling - 90, 270 - 90 - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used for a table recognition for PDF documents or images. It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``.