Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extraction: move max_tree_size parameter to settings.cfg #742

Merged
merged 2 commits on Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions tests/filters_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from trafilatura import extract
from trafilatura.metadata import Document
from trafilatura.settings import DEFAULT_CONFIG
from trafilatura.settings import DEFAULT_CONFIG, Extractor
from trafilatura.utils import LANGID_FLAG, check_html_lang, language_filter


Expand Down Expand Up @@ -35,17 +35,22 @@ def test_filters():
assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META)[0] is False
# test URL blacklist
assert extract('<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>', output_format='xml', url_blacklist={'https://example.org'}) is None

## recursion limit
options = Extractor()
options.max_tree_size = 500
my_p = '<p>abc</p>'
doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
assert extract(doc, max_tree_size=500) is not None
assert extract(doc, options=options) is not None
doc = html.fromstring('<html><body>' + my_p*501 + '</body></html>')
assert extract(doc, max_tree_size=500) is None
assert extract(doc, options=options) is None

options.formatting = True
my_p = '<p><hi rend="#i">abc</hi></p>'
doc = html.fromstring('<html><body>' + my_p*501 + '</body></html>')
assert extract(doc, include_formatting=True, max_tree_size=500) is None
assert extract(doc, options=options) is None
doc = html.fromstring('<html><body>' + my_p*499 + '</body></html>')
assert extract(doc, include_formatting=True, max_tree_size=500) is not None
assert extract(doc, options=options) is not None

# HTML lang filter
# no lang
Expand Down
4 changes: 4 additions & 0 deletions tests/resources/zerolength.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 0
MIN_OUTPUT_COMM_SIZE = 0


# discard documents with too many elements
MAX_TREE_SIZE = 100


# Set to 0 to disable signal
EXTRACTION_TIMEOUT = 0

Expand Down
4 changes: 4 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1566,6 +1566,10 @@ def test_deprecations():
assert extract(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
assert bare_extraction(htmlstring, no_fallback=True, config=ZERO_CONFIG) is not None
assert bare_extraction(htmlstring, as_dict=True, config=ZERO_CONFIG) is not None
with pytest.raises(ValueError):
extract(htmlstring, max_tree_size=100)
with pytest.raises(ValueError):
bare_extraction(htmlstring, max_tree_size=100)



Expand Down
9 changes: 5 additions & 4 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ def bare_extraction(
with_metadata: Extract metadata fields and add them to the output.
only_with_metadata: Only keep documents featuring all essential metadata
(date, title, url).
max_tree_size: Discard documents with too many elements.
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
as_dict: Will be deprecated, use the .as_dict() method of the document class.
Expand Down Expand Up @@ -205,6 +204,8 @@ def bare_extraction(
'"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
PendingDeprecationWarning
)
if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")

# regroup extraction options
if not options or not isinstance(options, Extractor):
Expand All @@ -221,7 +222,6 @@ def bare_extraction(
tables=include_tables,
dedup=deduplicate,
lang=target_language,
max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
Expand Down Expand Up @@ -412,7 +412,6 @@ def extract(
with_metadata: Extract metadata fields and add them to the output.
only_with_metadata: Only keep documents featuring all essential metadata
(date, title, url).
max_tree_size: Discard documents with too many elements.
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
settingsfile: Use a configuration file to override the standard settings.
Expand All @@ -432,6 +431,9 @@ def extract(
PendingDeprecationWarning
)

if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")

# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
Expand All @@ -447,7 +449,6 @@ def extract(
tables=include_tables,
dedup=deduplicate,
lang=target_language,
max_tree_size=max_tree_size,
url=url,
with_metadata=with_metadata,
only_with_metadata=only_with_metadata,
Expand Down
4 changes: 4 additions & 0 deletions trafilatura/settings.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ MIN_OUTPUT_SIZE = 1
MIN_OUTPUT_COMM_SIZE = 1


# discard documents with too many elements
MAX_TREE_SIZE =


# CLI file processing only, set to 0 to disable
EXTRACTION_TIMEOUT = 30

Expand Down
3 changes: 1 addition & 2 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def __init__(
tables: bool = True,
dedup: bool = False,
lang: Optional[str] = None,
max_tree_size: Optional[int] = None,
url: Optional[str] = None,
source: Optional[str] = None,
with_metadata: bool = False,
Expand All @@ -137,7 +136,6 @@ def __init__(
self.tables: bool = tables
self.dedup: bool = dedup
self.lang: Optional[str] = lang
self.max_tree_size: Optional[int] = max_tree_size
self.url: Optional[str] = url
self.only_with_metadata: bool = only_with_metadata
self.tei_validation: bool = tei_validation
Expand All @@ -152,6 +150,7 @@ def __init__(
self.date_params: Dict[str, Any] = date_params or set_date_params(
self.config.getboolean("DEFAULT", "EXTENSIVE_DATE_SEARCH")
)
self.max_tree_size = None

def _set_source(self, url: Optional[str], source: Optional[str]) -> None:
"Set the source attribute in a robust way."
Expand Down
Loading