'
doc = html.fromstring('' + my_p*50 + '')
- assert extract(doc, max_tree_size=500) is not None
+ assert extract(doc, options=options) is not None
doc = html.fromstring('' + my_p*501 + '')
- assert extract(doc, max_tree_size=500) is None
+ assert extract(doc, options=options) is None
+
+ options.formatting = True
my_p = '
abc
'
doc = html.fromstring('' + my_p*501 + '')
- assert extract(doc, include_formatting=True, max_tree_size=500) is None
+ assert extract(doc, options=options) is None
doc = html.fromstring('' + my_p*499 + '')
- assert extract(doc, include_formatting=True, max_tree_size=500) is not None
+ assert extract(doc, options=options) is not None
# HTML lang filter
# no lang
diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg
index f20cdbc4..2bc9da85 100644
--- a/tests/resources/zerolength.cfg
+++ b/tests/resources/zerolength.cfg
@@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 0
MIN_OUTPUT_COMM_SIZE = 0
+# Discard documents whose tree has too many elements
+MAX_TREE_SIZE = 100
+
+
# Set to 0 to disable signal
EXTRACTION_TIMEOUT = 0
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index f362f0a3..00046804 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1566,6 +1566,10 @@ def test_deprecations():
assert extract(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
assert bare_extraction(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
assert bare_extraction(htmlstring, as_dict=True, config=ZERO_CONFIG) is not None
+ with pytest.raises(ValueError):
+ extract(htmlstring, max_tree_size=100)
+ with pytest.raises(ValueError):
+ bare_extraction(htmlstring, max_tree_size=100)
diff --git a/trafilatura/core.py b/trafilatura/core.py
index cc0c554e..79e424e1 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -177,7 +177,6 @@ def bare_extraction(
with_metadata: Extract metadata fields and add them to the output.
only_with_metadata: Only keep documents featuring all essential metadata
(date, title, url).
- max_tree_size: Discard documents with too many elements.
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
as_dict: Will be deprecated, use the .as_dict() method of the document class.
@@ -205,6 +204,8 @@ def bare_extraction(
'"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
PendingDeprecationWarning
)
+ if max_tree_size:
+ raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
# regroup extraction options
if not options or not isinstance(options, Extractor):
@@ -221,7 +222,6 @@ def bare_extraction(
tables=include_tables,
dedup=deduplicate,
lang=target_language,
- max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
@@ -412,7 +412,6 @@ def extract(
with_metadata: Extract metadata fields and add them to the output.
only_with_metadata: Only keep documents featuring all essential metadata
(date, title, url).
- max_tree_size: Discard documents with too many elements.
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
settingsfile: Use a configuration file to override the standard settings.
@@ -432,6 +431,9 @@ def extract(
PendingDeprecationWarning
)
+ if max_tree_size:
+ raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
+
# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
@@ -447,7 +449,6 @@ def extract(
tables=include_tables,
dedup=deduplicate,
lang=target_language,
- max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
index 7c6596fc..88496c2b 100644
--- a/trafilatura/settings.cfg
+++ b/trafilatura/settings.cfg
@@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 1
MIN_OUTPUT_COMM_SIZE = 1
+# Discard documents whose tree has too many elements
+MAX_TREE_SIZE =
+
+
# CLI file processing only, set to 0 to disable
EXTRACTION_TIMEOUT = 30
diff --git a/trafilatura/settings.py b/trafilatura/settings.py
index 91243133..2341f7e9 100644
--- a/trafilatura/settings.py
+++ b/trafilatura/settings.py
@@ -113,7 +113,6 @@ def __init__(
tables: bool = True,
dedup: bool = False,
lang: Optional[str] = None,
- max_tree_size: Optional[int] = None,
url: Optional[str] = None,
source: Optional[str] = None,
with_metadata: bool = False,
@@ -137,7 +136,6 @@ def __init__(
self.tables: bool = tables
self.dedup: bool = dedup
self.lang: Optional[str] = lang
- self.max_tree_size: Optional[int] = max_tree_size
self.url: Optional[str] = url
self.only_with_metadata: bool = only_with_metadata
self.tei_validation: bool = tei_validation
@@ -152,6 +150,7 @@ def __init__(
self.date_params: Dict[str, Any] = date_params or set_date_params(
self.config.getboolean("DEFAULT", "EXTENSIVE_DATE_SEARCH")
)
+ self.max_tree_size = None
def _set_source(self, url: Optional[str], source: Optional[str]) -> None:
"Set the source attribute in a robust way."