From d695b680098bc1686bd9adfce0244c0ae5d99d03 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 1 Nov 2024 14:09:05 +0100
Subject: [PATCH] CLI downloads: use all information in settings file (#734)

* CLI downloads: use all information in settings file

* split buffered_downloads function
---
 tests/resources/newsettings.cfg | 20 +++++++++++---
 tests/resources/zerolength.cfg  | 19 +++++++++++---
 trafilatura/cli_utils.py        | 16 +++++++++---
 trafilatura/downloads.py        | 46 +++++++++++++++++++++++----------
 trafilatura/settings.cfg        | 15 +++++++++--
 5 files changed, 89 insertions(+), 27 deletions(-)

diff --git a/tests/resources/newsettings.cfg b/tests/resources/newsettings.cfg
index 9614d4bc..c6f63f62 100644
--- a/tests/resources/newsettings.cfg
+++ b/tests/resources/newsettings.cfg
@@ -6,30 +6,42 @@ DOWNLOAD_TIMEOUT = 10
 MAX_FILE_SIZE = 20000000
 MIN_FILE_SIZE = 10
+
 # sleep between requests
 SLEEP_TIME = 0.25
-# List of user-agents. Each user-agent should be put on a new line like so:
-# "agent1"
-# "agent2"
-# ...
+
+# one line per user-agent
 USER_AGENTS =
     Firefox
     Chrome
+
 # cookie for HTTP requests
+
 COOKIE = yummy_cookie=choco; tasty_cookie=strawberry
 
+# maximum number of redirects that we will follow
+MAX_REDIRECTS = 2
+
+
 # Extraction
 MIN_EXTRACTED_SIZE = 10000
 MIN_EXTRACTED_COMM_SIZE = 10000
 MIN_OUTPUT_SIZE = 10000
 MIN_OUTPUT_COMM_SIZE = 10000
+
 # Set to 0 to disable signal
 EXTRACTION_TIMEOUT = 0
 
+
 # Deduplication
 MIN_DUPLCHECK_SIZE = 10
 MAX_REPETITIONS = 3
 
+
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = off
+
+
+# URLs in feeds and sitemaps
+EXTERNAL_URLS = off
diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg
index ed6a0941..f20cdbc4 100644
--- a/tests/resources/zerolength.cfg
+++ b/tests/resources/zerolength.cfg
@@ -6,30 +6,41 @@ DOWNLOAD_TIMEOUT = 10
 MAX_FILE_SIZE = 20000000
 MIN_FILE_SIZE = 10
+
 # sleep between requests
 SLEEP_TIME = 0.25
-# List of user-agents. Each user-agent should be put on a new line like so:
-# "agent1"
-# "agent2"
-# ...
+
+# one line per user-agent
 USER_AGENTS =
     Firefox
     Chrome
+
 # cookie for HTTP requests
 COOKIE = yummy_cookie=choco; tasty_cookie=strawberry
 
+# maximum number of redirects that we will follow
+MAX_REDIRECTS = 2
+
+
 # Extraction
 MIN_EXTRACTED_SIZE = 0
 MIN_EXTRACTED_COMM_SIZE = 0
 MIN_OUTPUT_SIZE = 0
 MIN_OUTPUT_COMM_SIZE = 0
+
 # Set to 0 to disable signal
 EXTRACTION_TIMEOUT = 0
 
+
 # Deduplication
 MIN_DUPLCHECK_SIZE = 10
 MAX_REPETITIONS = 3
 
+
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = off
+
+
+# URLs in feeds and sitemaps
+EXTERNAL_URLS = off
diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index 6cedf374..cb07890c 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -29,7 +29,13 @@
 from .baseline import html2txt
 from .core import extract
 from .deduplication import generate_bow_hash
-from .downloads import Response, add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .downloads import (
+    Response,
+    add_to_compressed_dict,
+    buffered_downloads,
+    buffered_response_downloads,
+    load_download_buffer
+)
 from .feeds import find_feed_urls
 from .meta import reset_caches
 from .settings import (
@@ -377,8 +383,8 @@ def cli_crawler(
         bufferlist, spider.URL_STORE = load_download_buffer(
             spider.URL_STORE, sleep_time
         )
-        for url, result in buffered_downloads(
-            bufferlist, args.parallel, decode=False, options=options
+        for url, result in buffered_response_downloads(
+            bufferlist, args.parallel, options=options
         ):
             if result and isinstance(result, Response):
                 spider.process_response(result, param_dict[get_base_url(url)])
@@ -394,7 +400,9 @@ def probe_homepage(args: Any) -> None:
     input_urls = load_input_urls(args)
     options = args_to_extractor(args)
 
-    for url, result in buffered_downloads(input_urls, args.parallel, options=options):
+    for url, result in buffered_downloads(
+        input_urls, args.parallel, options=options
+    ):
         if result is not None:
             result = html2txt(result)
             if (
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index 2bc81afb..4aa5d4cb 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -13,8 +13,8 @@
 from importlib.metadata import version
 from io import BytesIO
 from time import sleep
-from typing import (Any, ByteString, Dict, Generator, List, Optional, Set,
-                    Tuple, Union)
+from typing import (Any, ByteString, Callable, Dict, Generator, List,
+                    Optional, Set, Tuple, Union)
 
 import certifi
 import urllib3
@@ -248,6 +248,7 @@ def fetch_url(
         Unicode string or None in case of failed downloads and invalid results.
 
     """
+    config = options.config if options else config
     response = fetch_response(url, decode=True, no_ssl=no_ssl, config=config)
     if response and response.data:
         if not options:
@@ -370,24 +371,43 @@ def load_download_buffer(
     return bufferlist, url_store
 
 
-def buffered_downloads(
+def _buffered_downloads(
     bufferlist: List[str],
     download_threads: int,
-    decode: bool = True,
-    options: Optional[Extractor] = None,
-) -> Generator[Tuple[str, Union[Response, str]], None, None]:
-    """Download queue consumer, single- or multi-threaded."""
-    worker = partial(fetch_url, options=options) if decode else fetch_response
+    worker: Callable[[str], Any],
+    chunksize: int = 10000,
+) -> Generator[Tuple[str, Any], None, None]:
+    "Use a thread pool to perform a series of downloads."
     with ThreadPoolExecutor(max_workers=download_threads) as executor:
-        for chunk in make_chunks(bufferlist, 10000):
-            future_to_url: Dict[Any, str] = {
-                executor.submit(worker, url): url for url in chunk
-            }
+        for chunk in make_chunks(bufferlist, chunksize):
+            future_to_url = {executor.submit(worker, url): url for url in chunk}
             for future in as_completed(future_to_url):
-                # url and download result
                 yield future_to_url[future], future.result()
 
 
+def buffered_downloads(
+    bufferlist: List[str],
+    download_threads: int,
+    options: Optional[Extractor] = None,
+) -> Generator[Tuple[str, str], None, None]:
+    "Download queue consumer, single- or multi-threaded."
+    worker = partial(fetch_url, options=options)
+
+    return _buffered_downloads(bufferlist, download_threads, worker)
+
+
+def buffered_response_downloads(
+    bufferlist: List[str],
+    download_threads: int,
+    options: Optional[Extractor] = None,
+) -> Generator[Tuple[str, Response], None, None]:
+    "Download queue consumer, returns full Response objects."
+    config = options.config if options else DEFAULT_CONFIG
+    worker = partial(fetch_response, config=config)
+
+    return _buffered_downloads(bufferlist, download_threads, worker)
+
+
 def _send_pycurl_request(
     url: str, no_ssl: bool, with_headers: bool, config: ConfigParser
 ) -> Optional[Response]:
diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
index 11a07895..7c6596fc 100644
--- a/trafilatura/settings.cfg
+++ b/trafilatura/settings.cfg
@@ -6,30 +6,41 @@ DOWNLOAD_TIMEOUT = 30
 MAX_FILE_SIZE = 20000000
 MIN_FILE_SIZE = 10
+
 # sleep between requests
 SLEEP_TIME = 5.0
-# user-agents here: agent1,agent2,...
+
+# one line per user-agent
 USER_AGENTS =
+# "agent1"
+# "agent2"
+
 # cookie for HTTP requests
 COOKIE =
-# Maximum number of redirects that we will follow
+
+# maximum number of redirects that we will follow
 MAX_REDIRECTS = 2
 
+
 # Extraction
 MIN_EXTRACTED_SIZE = 250
 MIN_EXTRACTED_COMM_SIZE = 1
 MIN_OUTPUT_SIZE = 1
 MIN_OUTPUT_COMM_SIZE = 1
+
 # CLI file processing only, set to 0 to disable
 EXTRACTION_TIMEOUT = 30
 
+
 # Deduplication
 MIN_DUPLCHECK_SIZE = 100
 MAX_REPETITIONS = 2
 
+
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = on
 
+
 # URLs in feeds and sitemaps
 EXTERNAL_URLS = off
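
Usage note (not part of the patch): a minimal sketch of how the two consumers split out above could be called after this change. Both yield (url, result) pairs; buffered_downloads returns decoded HTML strings via fetch_url, while buffered_response_downloads returns Response objects built with the configuration taken from the Extractor options (or the default settings when none are given). The URL list and thread count below are placeholder values.

    # hypothetical usage sketch, assuming a trafilatura install with this patch applied
    from trafilatura.downloads import (
        Response,
        buffered_downloads,
        buffered_response_downloads,
    )

    urls = ["https://example.org/", "https://example.net/page"]  # placeholder URLs
    threads = 4  # placeholder thread count

    # decoded HTML strings, as used for file processing and homepage probing
    for url, html in buffered_downloads(urls, threads):
        if html is not None:
            print(url, len(html))

    # full Response objects (raw data and metadata), as used by the CLI crawler
    for url, response in buffered_response_downloads(urls, threads):
        if isinstance(response, Response) and response.data:
            print(url, len(response.data))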