From 828355347b7000bae57ed6720d12ea0165f2c5c5 Mon Sep 17 00:00:00 2001 From: Edouard Belval Date: Tue, 5 Mar 2024 21:22:43 +0000 Subject: [PATCH] Add configuration option to hide key-value layouts --- textractor/data/text_linearization_config.py | 10 ++++++---- textractor/entities/layout.py | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/textractor/data/text_linearization_config.py b/textractor/data/text_linearization_config.py index 9c86099c..b785ba8e 100644 --- a/textractor/data/text_linearization_config.py +++ b/textractor/data/text_linearization_config.py @@ -14,13 +14,15 @@ class TextLinearizationConfig: max_number_of_consecutive_spaces: int = None #: Removes extra whitespace (None skips whitespace removal) - hide_header_layout: bool = False #: Hide headers in the linearized output + hide_header_layout: bool = False #: Hide header layouts in the linearized output - hide_footer_layout: bool = False #: Hide footers in the linearized output + hide_footer_layout: bool = False #: Hide footer layouts in the linearized output - hide_figure_layout: bool = False #: Hide figures in the linearized output + hide_figure_layout: bool = False #: Hide figure layouts in the linearized output - hide_table_layout: bool = False #: Hide tables in the linearized output + hide_table_layout: bool = False #: Hide table layouts in the linearized output + + hide_key_value_layout: bool = False #: Hide key-value layouts in the linearized output hide_page_num_layout: bool = False #: Hide page numbers in the linearized output diff --git a/textractor/entities/layout.py b/textractor/entities/layout.py index f8fb1202..3571cc48 100644 --- a/textractor/entities/layout.py +++ b/textractor/entities/layout.py @@ -110,6 +110,7 @@ def get_text_and_words( or (self.layout_type == LAYOUT_FIGURE and config.hide_figure_layout) or (self.layout_type == LAYOUT_PAGE_NUMBER and config.hide_page_num_layout) or (self.layout_type == LAYOUT_TABLE and config.hide_table_layout) + or 
(self.layout_type == LAYOUT_KEY_VALUE and config.hide_key_value_layout) ): return "", [] elif self.layout_type == LAYOUT_PAGE_NUMBER: