From d695b680098bc1686bd9adfce0244c0ae5d99d03 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 1 Nov 2024 14:09:05 +0100
Subject: [PATCH] CLI downloads: use all information in settings file (#734)

* CLI downloads: use all information in settings file

* split buffered_downloads function
---
 tests/resources/newsettings.cfg | 20 +++++++++++---
 tests/resources/zerolength.cfg  | 19 +++++++++++---
 trafilatura/cli_utils.py        | 16 +++++++++---
 trafilatura/downloads.py        | 46 +++++++++++++++++++++++----------
 trafilatura/settings.cfg        | 15 +++++++++--
 5 files changed, 89 insertions(+), 27 deletions(-)

diff --git a/tests/resources/newsettings.cfg b/tests/resources/newsettings.cfg
index 9614d4bc..c6f63f62 100644
--- a/tests/resources/newsettings.cfg
+++ b/tests/resources/newsettings.cfg
@@ -6,30 +6,42 @@ DOWNLOAD_TIMEOUT = 10
 MAX_FILE_SIZE = 20000000
 MIN_FILE_SIZE = 10
+
 # sleep between requests
 SLEEP_TIME = 0.25
-# List of user-agents. Each user-agent should be put on a new line like so:
-# "agent1"
-# "agent2"
-# ...
+
+# one line per user-agent
 USER_AGENTS =
     Firefox
     Chrome
+
 # cookie for HTTP requests
+
 COOKIE = yummy_cookie=choco; tasty_cookie=strawberry
 
+# maximum number of redirects that we will follow
+MAX_REDIRECTS = 2
+
+
 # Extraction
 MIN_EXTRACTED_SIZE = 10000
 MIN_EXTRACTED_COMM_SIZE = 10000
 MIN_OUTPUT_SIZE = 10000
 MIN_OUTPUT_COMM_SIZE = 10000
+
 # Set to 0 to disable signal
 EXTRACTION_TIMEOUT = 0
 
+
 # Deduplication
 MIN_DUPLCHECK_SIZE = 10
 MAX_REPETITIONS = 3
 
+
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = off
+
+
+# URLs in feeds and sitemaps
+EXTERNAL_URLS = off
diff --git a/tests/resources/zerolength.cfg b/tests/resources/zerolength.cfg
index ed6a0941..f20cdbc4 100644
--- a/tests/resources/zerolength.cfg
+++ b/tests/resources/zerolength.cfg
@@ -6,30 +6,41 @@ DOWNLOAD_TIMEOUT = 10
 MAX_FILE_SIZE = 20000000
 MIN_FILE_SIZE = 10
+
 # sleep between requests
 SLEEP_TIME = 0.25
-# List of user-agents. Each user-agent should be put on a new line like so:
-# "agent1"
-# "agent2"
-# ...
+
+# one line per user-agent
 USER_AGENTS =
     Firefox
     Chrome
+
 # cookie for HTTP requests
 COOKIE = yummy_cookie=choco; tasty_cookie=strawberry
 
+# maximum number of redirects that we will follow
+MAX_REDIRECTS = 2
+
+
 # Extraction
 MIN_EXTRACTED_SIZE = 0
 MIN_EXTRACTED_COMM_SIZE = 0
 MIN_OUTPUT_SIZE = 0
 MIN_OUTPUT_COMM_SIZE = 0
+
 # Set to 0 to disable signal
 EXTRACTION_TIMEOUT = 0
 
+
 # Deduplication
 MIN_DUPLCHECK_SIZE = 10
 MAX_REPETITIONS = 3
 
+
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = off
+
+
+# URLs in feeds and sitemaps
+EXTERNAL_URLS = off
diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index 6cedf374..cb07890c 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -29,7 +29,13 @@
 from .baseline import html2txt
 from .core import extract
 from .deduplication import generate_bow_hash
-from .downloads import Response, add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .downloads import (
+    Response,
+    add_to_compressed_dict,
+    buffered_downloads,
+    buffered_response_downloads,
+    load_download_buffer
+)
 from .feeds import find_feed_urls
 from .meta import reset_caches
 from .settings import (
@@ -377,8 +383,8 @@ def cli_crawler(
         bufferlist, spider.URL_STORE = load_download_buffer(
             spider.URL_STORE, sleep_time
         )
-        for url, result in buffered_downloads(
-            bufferlist, args.parallel, decode=False, options=options
+        for url, result in buffered_response_downloads(
+            bufferlist, args.parallel, options=options
         ):
             if result and isinstance(result, Response):
                 spider.process_response(result, param_dict[get_base_url(url)])
@@ -394,7 +400,9 @@ def probe_homepage(args: Any) -> None:
     input_urls = load_input_urls(args)
     options = args_to_extractor(args)
 
-    for url, result in buffered_downloads(input_urls, args.parallel, options=options):
+    for url, result in buffered_downloads(
+        input_urls, args.parallel, options=options
+    ):
         if result is not None:
             result = html2txt(result)
             if (
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index 2bc81afb..4aa5d4cb 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -13,8 +13,8 @@
 from importlib.metadata import version
 from io import BytesIO
 from time import sleep
-from typing import (Any, ByteString, Dict, Generator, List, Optional, Set,
-                    Tuple, Union)
+from typing import (Any, ByteString, Callable, Dict, Generator, List,
+                    Optional, Set, Tuple, Union)
 
 import certifi
 import urllib3
@@ -248,6 +248,7 @@ def fetch_url(
         Unicode string or None in case of failed downloads and invalid results.
 
     """
+    config = options.config if options else config
     response = fetch_response(url, decode=True, no_ssl=no_ssl, config=config)
     if response and response.data:
         if not options:
@@ -370,24 +371,43 @@ def load_download_buffer(
     return bufferlist, url_store
 
 
-def buffered_downloads(
+def _buffered_downloads(
     bufferlist: List[str],
     download_threads: int,
-    decode: bool = True,
-    options: Optional[Extractor] = None,
-) -> Generator[Tuple[str, Union[Response, str]], None, None]:
-    """Download queue consumer, single- or multi-threaded."""
-    worker = partial(fetch_url, options=options) if decode else fetch_response
+    worker: Callable[[str], Any],
+    chunksize: int = 10000,
+) -> Generator[Tuple[str, Any], None, None]:
+    "Use a thread pool to perform a series of downloads."
     with ThreadPoolExecutor(max_workers=download_threads) as executor:
-        for chunk in make_chunks(bufferlist, 10000):
-            future_to_url: Dict[Any, str] = {
-                executor.submit(worker, url): url for url in chunk
-            }
+        for chunk in make_chunks(bufferlist, chunksize):
+            future_to_url = {executor.submit(worker, url): url for url in chunk}
             for future in as_completed(future_to_url):
-                # url and download result
                 yield future_to_url[future], future.result()
 
 
+def buffered_downloads(
+    bufferlist: List[str],
+    download_threads: int,
+    options: Optional[Extractor] = None,
+) -> Generator[Tuple[str, str], None, None]:
+    "Download queue consumer, single- or multi-threaded."
+    worker = partial(fetch_url, options=options)
+
+    return _buffered_downloads(bufferlist, download_threads, worker)
+
+
+def buffered_response_downloads(
+    bufferlist: List[str],
+    download_threads: int,
+    options: Optional[Extractor] = None,
+) -> Generator[Tuple[str, Response], None, None]:
+    "Download queue consumer, returns full Response objects."
+    config = options.config if options else DEFAULT_CONFIG
+    worker = partial(fetch_response, config=config)
+
+    return _buffered_downloads(bufferlist, download_threads, worker)
+
+
 def _send_pycurl_request(
     url: str, no_ssl: bool, with_headers: bool, config: ConfigParser
 ) -> Optional[Response]:
diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
index 11a07895..7c6596fc 100644
--- a/trafilatura/settings.cfg
+++ b/trafilatura/settings.cfg
@@ -6,30 +6,41 @@ DOWNLOAD_TIMEOUT = 30
 MAX_FILE_SIZE = 20000000
 MIN_FILE_SIZE = 10
+
 # sleep between requests
 SLEEP_TIME = 5.0
-# user-agents here: agent1,agent2,...
+
+# one line per user-agent
 USER_AGENTS =
+# "agent1"
+# "agent2"
+
 # cookie for HTTP requests
 COOKIE =
-# Maximum number of redirects that we will follow
+
+# maximum number of redirects that we will follow
 MAX_REDIRECTS = 2
 
+
 # Extraction
 MIN_EXTRACTED_SIZE = 250
 MIN_EXTRACTED_COMM_SIZE = 1
 MIN_OUTPUT_SIZE = 1
 MIN_OUTPUT_COMM_SIZE = 1
+
 # CLI file processing only, set to 0 to disable
 EXTRACTION_TIMEOUT = 30
 
+
 # Deduplication
 MIN_DUPLCHECK_SIZE = 100
 MAX_REPETITIONS = 2
 
+
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = on
 
+
 # URLs in feeds and sitemaps
 EXTERNAL_URLS = off
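
Usage note (not part of the patch): a minimal sketch of how the two consumers split out above could be called after this change. Both yield (url, result) pairs; buffered_downloads returns decoded HTML strings via fetch_url, while buffered_response_downloads returns Response objects built with the configuration taken from the Extractor options (or the default settings when none are given). The URL list and thread count below are placeholder values.

    # hypothetical usage sketch, assuming a trafilatura install with this patch applied
    from trafilatura.downloads import (
        Response,
        buffered_downloads,
        buffered_response_downloads,
    )

    urls = ["https://example.org/", "https://example.net/page"]  # placeholder URLs
    threads = 4  # placeholder thread count

    # decoded HTML strings, as used for file processing and homepage probing
    for url, html in buffered_downloads(urls, threads):
        if html is not None:
            print(url, len(html))

    # full Response objects (raw data and metadata), as used by the CLI crawler
    for url, response in buffered_response_downloads(urls, threads):
        if isinstance(response, Response) and response.data:
            print(url, len(response.data))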