diff --git a/tests/filters_tests.py b/tests/filters_tests.py index 5b6e555a..15d5eb04 100644 --- a/tests/filters_tests.py +++ b/tests/filters_tests.py @@ -7,7 +7,7 @@ from trafilatura import extract from trafilatura.metadata import Document -from trafilatura.settings import DEFAULT_CONFIG +from trafilatura.settings import DEFAULT_CONFIG, Extractor from trafilatura.utils import LANGID_FLAG, check_html_lang, language_filter @@ -35,17 +35,22 @@ def test_filters(): assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META)[0] is False # test URL blacklist assert extract('', output_format='xml', url_blacklist={'https://example.org'}) is None + ## recursion limit + options = Extractor() + options.max_tree_size = 500 my_p = '

abc

' doc = html.fromstring('' + my_p*50 + '') - assert extract(doc, max_tree_size=500) is not None + assert extract(doc, options=options) is not None doc = html.fromstring('' + my_p*501 + '') - assert extract(doc, max_tree_size=500) is None + assert extract(doc, options=options) is None + + options.formatting = True my_p = '

abc

' doc = html.fromstring('' + my_p*501 + '') - assert extract(doc, include_formatting=True, max_tree_size=500) is None + assert extract(doc, options=options) is None doc = html.fromstring('' + my_p*499 + '') - assert extract(doc, include_formatting=True, max_tree_size=500) is not None + assert extract(doc, options=options) is not None # HTML lang filter # no lang diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg index f20cdbc4..2bc9da85 100644 --- a/tests/resources/zerolength.cfg +++ b/tests/resources/zerolength.cfg @@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 0 MIN_OUTPUT_COMM_SIZE = 0 +# discard documents with too many elements +MAX_TREE_SIZE = 100 + + # Set to 0 to disable signal EXTRACTION_TIMEOUT = 0 diff --git a/tests/unit_tests.py b/tests/unit_tests.py index f362f0a3..00046804 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1566,6 +1566,10 @@ def test_deprecations(): assert extract(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None assert bare_extraction(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None assert bare_extraction(htmlstring, as_dict=True, config=ZERO_CONFIG) is not None + with pytest.raises(ValueError): + extract(htmlstring, max_tree_size=100) + with pytest.raises(ValueError): + bare_extraction(htmlstring, max_tree_size=100) diff --git a/trafilatura/core.py b/trafilatura/core.py index cc0c554e..79e424e1 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -177,7 +177,6 @@ def bare_extraction( with_metadata: Extract metadata fields and add them to the output. only_with_metadata: Only keep documents featuring all essential metadata (date, title, url). - max_tree_size: Discard documents with too many elements. url_blacklist: Provide a blacklist of URLs as set() to filter out documents. author_blacklist: Provide a blacklist of Author Names as set() to filter out authors. as_dict: Will be deprecated, use the .as_dict() method of the document class. @@ -205,6 +204,8 @@ def bare_extraction( '"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results', PendingDeprecationWarning ) + if max_tree_size: + raise ValueError("max_tree_size is deprecated, use settings.cfg file instead") # regroup extraction options if not options or not isinstance(options, Extractor): @@ -221,7 +222,6 @@ def bare_extraction( tables=include_tables, dedup=deduplicate, lang=target_language, - max_tree_size=max_tree_size, url=url, with_metadata=with_metadata, only_with_metadata=only_with_metadata, @@ -412,7 +412,6 @@ def extract( with_metadata: Extract metadata fields and add them to the output. only_with_metadata: Only keep documents featuring all essential metadata (date, title, url). - max_tree_size: Discard documents with too many elements. url_blacklist: Provide a blacklist of URLs as set() to filter out documents. author_blacklist: Provide a blacklist of Author Names as set() to filter out authors. settingsfile: Use a configuration file to override the standard settings. @@ -432,6 +431,9 @@ def extract( PendingDeprecationWarning ) + if max_tree_size: + raise ValueError("max_tree_size is deprecated, use settings.cfg file instead") + # regroup extraction options if not options or not isinstance(options, Extractor): options = Extractor( @@ -447,7 +449,6 @@ def extract( tables=include_tables, dedup=deduplicate, lang=target_language, - max_tree_size=max_tree_size, url=url, with_metadata=with_metadata, only_with_metadata=only_with_metadata, diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg index 7c6596fc..88496c2b 100644 --- a/trafilatura/settings.cfg +++ b/trafilatura/settings.cfg @@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 1 MIN_OUTPUT_COMM_SIZE = 1 +# discard documents with too many elements +MAX_TREE_SIZE = + + # CLI file processing only, set to 0 to disable EXTRACTION_TIMEOUT = 30 diff --git a/trafilatura/settings.py b/trafilatura/settings.py index 91243133..2341f7e9 100644 --- a/trafilatura/settings.py +++ b/trafilatura/settings.py @@ -113,7 +113,6 @@ def __init__( tables: bool = True, dedup: bool = False, lang: Optional[str] = None, - max_tree_size: Optional[int] = None, url: Optional[str] = None, source: Optional[str] = None, with_metadata: bool = False, @@ -137,7 +136,6 @@ def __init__( self.tables: bool = tables self.dedup: bool = dedup self.lang: Optional[str] = lang - self.max_tree_size: Optional[int] = max_tree_size self.url: Optional[str] = url self.only_with_metadata: bool = only_with_metadata self.tei_validation: bool = tei_validation @@ -152,6 +150,7 @@ def __init__( self.date_params: Dict[str, Any] = date_params or set_date_params( self.config.getboolean("DEFAULT", "EXTENSIVE_DATE_SEARCH") ) + self.max_tree_size = None def _set_source(self, url: Optional[str], source: Optional[str]) -> None: "Set the source attribute in a robust way."