adbar · adbar · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/tests/filters_tests.py b/tests/filters_tests.py
@@ -7,7 +7,7 @@
 
 from trafilatura import extract
 from trafilatura.metadata import Document
-from trafilatura.settings import DEFAULT_CONFIG
+from trafilatura.settings import DEFAULT_CONFIG, Extractor
 from trafilatura.utils import LANGID_FLAG, check_html_lang, language_filter
 
 
@@ -35,17 +35,22 @@ def test_filters():
         assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META)[0] is False
     # test URL blacklist
     assert extract('<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>', output_format='xml', url_blacklist={'https://example.org'}) is None
+
     ## recursion limit
+    options = Extractor()
+    options.max_tree_size = 500
     my_p = '<p>abc</p>'
     doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
-    assert extract(doc, max_tree_size=500) is not None
+    assert extract(doc, options=options) is not None
     doc = html.fromstring('<html><body>' + my_p*501 + '</body></html>')
-    assert extract(doc, max_tree_size=500) is None
+    assert extract(doc, options=options) is None
+
+    options.formatting = True
     my_p = '<p><hi rend="#i">abc</hi></p>'
     doc = html.fromstring('<html><body>' + my_p*501 + '</body></html>')
-    assert extract(doc, include_formatting=True, max_tree_size=500) is None
+    assert extract(doc, options=options) is None
     doc = html.fromstring('<html><body>' + my_p*499 + '</body></html>')
-    assert extract(doc, include_formatting=True, max_tree_size=500) is not None
+    assert extract(doc, options=options) is not None
 
     # HTML lang filter
     # no lang

diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg
@@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 0
 MIN_OUTPUT_COMM_SIZE = 0
 
 
+# discard documents with too many elements
+MAX_TREE_SIZE = 100
+
+
 # Set to 0 to disable signal
 EXTRACTION_TIMEOUT = 0
 

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -1566,6 +1566,10 @@ def test_deprecations():
     assert extract(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
     assert bare_extraction(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
     assert bare_extraction(htmlstring, as_dict=True, config=ZERO_CONFIG) is not None
+    with pytest.raises(ValueError):
+        extract(htmlstring, max_tree_size=100)
+    with pytest.raises(ValueError):
+        bare_extraction(htmlstring, max_tree_size=100)
 
 
 

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -177,7 +177,6 @@ def bare_extraction(
         with_metadata: Extract metadata fields and add them to the output.
         only_with_metadata: Only keep documents featuring all essential metadata
             (date, title, url).
-        max_tree_size: Discard documents with too many elements.
         url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
         author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
         as_dict: Will be deprecated, use the .as_dict() method of the document class.
@@ -205,6 +204,8 @@ def bare_extraction(
             '"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
             PendingDeprecationWarning
         )
+    if max_tree_size:
+        raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
 
     # regroup extraction options
     if not options or not isinstance(options, Extractor):
@@ -221,7 +222,6 @@ def bare_extraction(
             tables=include_tables,
             dedup=deduplicate,
             lang=target_language,
-            max_tree_size=max_tree_size,
             url=url,
             with_metadata=with_metadata,
             only_with_metadata=only_with_metadata,
@@ -412,7 +412,6 @@ def extract(
         with_metadata: Extract metadata fields and add them to the output.
         only_with_metadata: Only keep documents featuring all essential metadata
             (date, title, url).
-        max_tree_size: Discard documents with too many elements.
         url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
         author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
         settingsfile: Use a configuration file to override the standard settings.
@@ -432,6 +431,9 @@ def extract(
             PendingDeprecationWarning
         )
 
+    if max_tree_size:
+        raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
+
     # regroup extraction options
     if not options or not isinstance(options, Extractor):
         options = Extractor(
@@ -447,7 +449,6 @@ def extract(
             tables=include_tables,
             dedup=deduplicate,
             lang=target_language,
-            max_tree_size=max_tree_size,
             url=url,
             with_metadata=with_metadata,
             only_with_metadata=only_with_metadata,

diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
@@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 1
 MIN_OUTPUT_COMM_SIZE = 1
 
 
+# discard documents with too many elements
+MAX_TREE_SIZE = 
+
+
 # CLI file processing only, set to 0 to disable
 EXTRACTION_TIMEOUT = 30
 

diff --git a/trafilatura/settings.py b/trafilatura/settings.py
@@ -113,7 +113,6 @@ def __init__(
         tables: bool = True,
         dedup: bool = False,
         lang: Optional[str] = None,
-        max_tree_size: Optional[int] = None,
         url: Optional[str] = None,
         source: Optional[str] = None,
         with_metadata: bool = False,
@@ -137,7 +136,6 @@ def __init__(
         self.tables: bool = tables
         self.dedup: bool = dedup
         self.lang: Optional[str] = lang
-        self.max_tree_size: Optional[int] = max_tree_size
         self.url: Optional[str] = url
         self.only_with_metadata: bool = only_with_metadata
         self.tei_validation: bool = tei_validation
@@ -152,6 +150,7 @@ def __init__(
         self.date_params: Dict[str, Any] = date_params or set_date_params(
             self.config.getboolean("DEFAULT", "EXTENSIVE_DATE_SEARCH")
         )
+        self.max_tree_size = None
 
     def _set_source(self, url: Optional[str], source: Optional[str]) -> None:
         "Set the source attribute in a robust way."