Skip to content

Commit

Permalink
extraction: move max_tree_size parameter to settings.cfg (#742)
Browse files Browse the repository at this point in the history
* extraction: move max_tree_size parameter to settings.cfg

* fix tests
  • Loading branch information
adbar authored Nov 11, 2024
1 parent b108253 commit ec83ff7
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 11 deletions.
15 changes: 10 additions & 5 deletions tests/filters_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from trafilatura import extract
from trafilatura.metadata import Document
from trafilatura.settings import DEFAULT_CONFIG
from trafilatura.settings import DEFAULT_CONFIG, Extractor
from trafilatura.utils import LANGID_FLAG, check_html_lang, language_filter


Expand Down Expand Up @@ -35,17 +35,22 @@ def test_filters():
assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META)[0] is False
# test URL blacklist
assert extract('<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>', output_format='xml', url_blacklist={'https://example.org'}) is None

## tree size limit (max_tree_size): documents with too many elements are discarded
options = Extractor()
options.max_tree_size = 500
my_p = '<p>abc</p>'
doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
assert extract(doc, max_tree_size=500) is not None
assert extract(doc, options=options) is not None
doc = html.fromstring('<html><body>' + my_p*501 + '</body></html>')
assert extract(doc, max_tree_size=500) is None
assert extract(doc, options=options) is None

options.formatting = True
my_p = '<p><hi rend="#i">abc</hi></p>'
doc = html.fromstring('<html><body>' + my_p*501 + '</body></html>')
assert extract(doc, include_formatting=True, max_tree_size=500) is None
assert extract(doc, options=options) is None
doc = html.fromstring('<html><body>' + my_p*499 + '</body></html>')
assert extract(doc, include_formatting=True, max_tree_size=500) is not None
assert extract(doc, options=options) is not None

# HTML lang filter
# no lang
Expand Down
4 changes: 4 additions & 0 deletions tests/resources/zerolength.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 0
MIN_OUTPUT_COMM_SIZE = 0


# discard documents with too many elements
MAX_TREE_SIZE = 100


# Set to 0 to disable signal
EXTRACTION_TIMEOUT = 0

Expand Down
4 changes: 4 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1566,6 +1566,10 @@ def test_deprecations():
assert extract(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
assert bare_extraction(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
assert bare_extraction(htmlstring, as_dict=True, config=ZERO_CONFIG) is not None
with pytest.raises(ValueError):
extract(htmlstring, max_tree_size=100)
with pytest.raises(ValueError):
bare_extraction(htmlstring, max_tree_size=100)



Expand Down
9 changes: 5 additions & 4 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ def bare_extraction(
with_metadata: Extract metadata fields and add them to the output.
only_with_metadata: Only keep documents featuring all essential metadata
(date, title, url).
max_tree_size: Discard documents with too many elements.
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
as_dict: Will be deprecated, use the .as_dict() method of the document class.
Expand Down Expand Up @@ -205,6 +204,8 @@ def bare_extraction(
'"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
PendingDeprecationWarning
)
if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")

# regroup extraction options
if not options or not isinstance(options, Extractor):
Expand All @@ -221,7 +222,6 @@ def bare_extraction(
tables=include_tables,
dedup=deduplicate,
lang=target_language,
max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
Expand Down Expand Up @@ -412,7 +412,6 @@ def extract(
with_metadata: Extract metadata fields and add them to the output.
only_with_metadata: Only keep documents featuring all essential metadata
(date, title, url).
max_tree_size: Discard documents with too many elements.
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
settingsfile: Use a configuration file to override the standard settings.
Expand All @@ -432,6 +431,9 @@ def extract(
PendingDeprecationWarning
)

if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")

# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
Expand All @@ -447,7 +449,6 @@ def extract(
tables=include_tables,
dedup=deduplicate,
lang=target_language,
max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
Expand Down
4 changes: 4 additions & 0 deletions trafilatura/settings.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 1
MIN_OUTPUT_COMM_SIZE = 1


# discard documents with too many elements
MAX_TREE_SIZE =


# CLI file processing only, set to 0 to disable
EXTRACTION_TIMEOUT = 30

Expand Down
3 changes: 1 addition & 2 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def __init__(
tables: bool = True,
dedup: bool = False,
lang: Optional[str] = None,
max_tree_size: Optional[int] = None,
url: Optional[str] = None,
source: Optional[str] = None,
with_metadata: bool = False,
Expand All @@ -137,7 +136,6 @@ def __init__(
self.tables: bool = tables
self.dedup: bool = dedup
self.lang: Optional[str] = lang
self.max_tree_size: Optional[int] = max_tree_size
self.url: Optional[str] = url
self.only_with_metadata: bool = only_with_metadata
self.tei_validation: bool = tei_validation
Expand All @@ -152,6 +150,7 @@ def __init__(
self.date_params: Dict[str, Any] = date_params or set_date_params(
self.config.getboolean("DEFAULT", "EXTENSIVE_DATE_SEARCH")
)
self.max_tree_size = None

def _set_source(self, url: Optional[str], source: Optional[str]) -> None:
"Set the source attribute in a robust way."
Expand Down

0 comments on commit ec83ff7

Please sign in to comment.