
Commit

CLI downloads: use all information in settings file (#734)
* CLI downloads: use all information in settings file

* split buffered_downloads function
adbar authored Nov 1, 2024
1 parent: 3b86697 · commit: d695b68
Showing 5 changed files with 89 additions and 27 deletions.
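The change makes the CLI download helpers pass the whole settings file (sleep time, user agents, cookie, redirect limit) down to the fetching layer, and splits buffered_downloads into a generic thread-pool helper plus two thin wrappers (see trafilatura/downloads.py below). As a rough illustration of the user-facing effect, here is a minimal sketch, not part of the commit, assuming the documented use_config() helper and the config arguments visible in the diffs; the file path and URL are placeholders:

# Illustrative sketch only: load a custom settings file and hand its values
# to the download and extraction steps.
from trafilatura import extract
from trafilatura.downloads import fetch_url
from trafilatura.settings import use_config

my_config = use_config("tests/resources/newsettings.cfg")  # example path from this commit
downloaded = fetch_url("https://example.org", config=my_config)  # placeholder URL
if downloaded is not None:
    print(extract(downloaded, config=my_config))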
tests/resources/newsettings.cfg: 16 additions & 4 deletions
@@ -6,30 +6,42 @@
DOWNLOAD_TIMEOUT = 10
MAX_FILE_SIZE = 20000000
MIN_FILE_SIZE = 10

# sleep between requests
SLEEP_TIME = 0.25
# List of user-agents. Each user-agent should be put on a new line like so:
# "agent1"
# "agent2"
# ...

# one line per user-agent
USER_AGENTS =
Firefox
Chrome

# cookie for HTTP requests

COOKIE = yummy_cookie=choco; tasty_cookie=strawberry

# maximum number of redirects that we will follow
MAX_REDIRECTS = 2


# Extraction
MIN_EXTRACTED_SIZE = 10000
MIN_EXTRACTED_COMM_SIZE = 10000
MIN_OUTPUT_SIZE = 10000
MIN_OUTPUT_COMM_SIZE = 10000


# Set to 0 to disable signal
EXTRACTION_TIMEOUT = 0


# Deduplication
MIN_DUPLCHECK_SIZE = 10
MAX_REPETITIONS = 3


# Extraction option for Htmldate
EXTENSIVE_DATE_SEARCH = off


# URLs in feeds and sitemaps
EXTERNAL_URLS = off
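The reformatted test file now spells out one user agent per line. A small sketch of how such a multi-line value can be read back with the standard library; the [DEFAULT] section name is an assumption, since the hunk starts below the file header:

# Sketch under the assumption that the file uses a [DEFAULT] section (not shown
# in the hunk); configparser keeps indented continuation lines as one value.
from configparser import ConfigParser

config = ConfigParser()
config.read("tests/resources/newsettings.cfg")  # path relative to the repository root
agents = [a for a in config.get("DEFAULT", "USER_AGENTS").splitlines() if a.strip()]
print(agents)  # expected: ['Firefox', 'Chrome'] given the file above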
tests/resources/zerolength.cfg: 15 additions & 4 deletions
@@ -6,30 +6,41 @@
DOWNLOAD_TIMEOUT = 10
MAX_FILE_SIZE = 20000000
MIN_FILE_SIZE = 10

# sleep between requests
SLEEP_TIME = 0.25
# List of user-agents. Each user-agent should be put on a new line like so:
# "agent1"
# "agent2"
# ...

# one line per user-agent
USER_AGENTS =
Firefox
Chrome

# cookie for HTTP requests
COOKIE = yummy_cookie=choco; tasty_cookie=strawberry

# maximum number of redirects that we will follow
MAX_REDIRECTS = 2


# Extraction
MIN_EXTRACTED_SIZE = 0
MIN_EXTRACTED_COMM_SIZE = 0
MIN_OUTPUT_SIZE = 0
MIN_OUTPUT_COMM_SIZE = 0


# Set to 0 to disable signal
EXTRACTION_TIMEOUT = 0


# Deduplication
MIN_DUPLCHECK_SIZE = 10
MAX_REPETITIONS = 3


# Extraction option for Htmldate
EXTENSIVE_DATE_SEARCH = off


# URLs in feeds and sitemaps
EXTERNAL_URLS = off
trafilatura/cli_utils.py: 12 additions & 4 deletions
@@ -29,7 +29,13 @@
from .baseline import html2txt
from .core import extract
from .deduplication import generate_bow_hash
from .downloads import Response, add_to_compressed_dict, buffered_downloads, load_download_buffer
from .downloads import (
Response,
add_to_compressed_dict,
buffered_downloads,
buffered_response_downloads,
load_download_buffer
)
from .feeds import find_feed_urls
from .meta import reset_caches
from .settings import (
@@ -377,8 +383,8 @@ def cli_crawler(
bufferlist, spider.URL_STORE = load_download_buffer(
spider.URL_STORE, sleep_time
)
for url, result in buffered_downloads(
bufferlist, args.parallel, decode=False, options=options
for url, result in buffered_response_downloads(
bufferlist, args.parallel, options=options
):
if result and isinstance(result, Response):
spider.process_response(result, param_dict[get_base_url(url)])
@@ -394,7 +400,9 @@ def probe_homepage(args: Any) -> None:
input_urls = load_input_urls(args)
options = args_to_extractor(args)

for url, result in buffered_downloads(input_urls, args.parallel, options=options):
for url, result in buffered_downloads(
input_urls, args.parallel, options=options
):
if result is not None:
result = html2txt(result)
if (
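In short, the crawler loop now consumes buffered_response_downloads and receives full Response objects, while probe_homepage keeps the decoded-string variant. A usage sketch of the Response-based generator follows; the URLs are placeholders and the attribute access is an assumption based on the Response class in trafilatura/downloads.py:

# Illustrative consumer of the new Response-based generator (not part of the
# commit); .status is assumed to be an attribute of Response.
from trafilatura.downloads import Response, buffered_response_downloads

urls = ["https://example.org/", "https://example.org/about"]  # placeholders
for url, result in buffered_response_downloads(urls, 4):
    if result and isinstance(result, Response):
        print(url, result.status)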
trafilatura/downloads.py: 33 additions & 13 deletions
@@ -13,8 +13,8 @@
from importlib.metadata import version
from io import BytesIO
from time import sleep
from typing import (Any, ByteString, Dict, Generator, List, Optional, Set,
Tuple, Union)
from typing import (Any, ByteString, Callable, Dict, Generator, List,
Optional, Set, Tuple, Union)

import certifi
import urllib3
@@ -248,6 +248,7 @@ def fetch_url(
Unicode string or None in case of failed downloads and invalid results.
"""
config = options.config if options else config
response = fetch_response(url, decode=True, no_ssl=no_ssl, config=config)
if response and response.data:
if not options:
@@ -370,24 +371,43 @@ def load_download_buffer(
return bufferlist, url_store


def buffered_downloads(
def _buffered_downloads(
bufferlist: List[str],
download_threads: int,
decode: bool = True,
options: Optional[Extractor] = None,
) -> Generator[Tuple[str, Union[Response, str]], None, None]:
"""Download queue consumer, single- or multi-threaded."""
worker = partial(fetch_url, options=options) if decode else fetch_response
worker: Callable[[str], Any],
chunksize: int = 10000,
) -> Generator[Tuple[str, Any], None, None]:
"Use a thread pool to perform a series of downloads."
with ThreadPoolExecutor(max_workers=download_threads) as executor:
for chunk in make_chunks(bufferlist, 10000):
future_to_url: Dict[Any, str] = {
executor.submit(worker, url): url for url in chunk
}
for chunk in make_chunks(bufferlist, chunksize):
future_to_url = {executor.submit(worker, url): url for url in chunk}
for future in as_completed(future_to_url):
# url and download result
yield future_to_url[future], future.result()


def buffered_downloads(
bufferlist: List[str],
download_threads: int,
options: Optional[Extractor] = None,
) -> Generator[Tuple[str, str], None, None]:
"Download queue consumer, single- or multi-threaded."
worker = partial(fetch_url, options=options)

return _buffered_downloads(bufferlist, download_threads, worker)


def buffered_response_downloads(
bufferlist: List[str],
download_threads: int,
options: Optional[Extractor] = None,
) -> Generator[Tuple[str, Response], None, None]:
"Download queue consumer, returns full Response objects."
config = options.config if options else DEFAULT_CONFIG
worker = partial(fetch_response, config=config)

return _buffered_downloads(bufferlist, download_threads, worker)


def _send_pycurl_request(
url: str, no_ssl: bool, with_headers: bool, config: ConfigParser
) -> Optional[Response]:
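The refactoring isolates the thread-pool logic in _buffered_downloads, which accepts any worker callable, while the public functions merely choose the worker: fetch_url for decoded strings, fetch_response for Response objects. A minimal usage sketch of the string-returning wrapper, with placeholder URLs and default options:

# Sketch only: download a few pages in parallel and print the size of each
# decoded document; fetch_url (the underlying worker) returns None on failure.
from trafilatura.downloads import buffered_downloads

urls = ["https://example.org/", "https://example.org/blog"]  # placeholders
for url, html in buffered_downloads(urls, 4):
    if html:
        print(url, len(html))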
trafilatura/settings.cfg: 13 additions & 2 deletions
@@ -6,30 +6,41 @@
DOWNLOAD_TIMEOUT = 30
MAX_FILE_SIZE = 20000000
MIN_FILE_SIZE = 10

# sleep between requests
SLEEP_TIME = 5.0
# user-agents here: agent1,agent2,...

# one line per user-agent
USER_AGENTS =
# "agent1"
# "agent2"

# cookie for HTTP requests
COOKIE =
# Maximum number of redirects that we will follow

# maximum number of redirects that we will follow
MAX_REDIRECTS = 2


# Extraction
MIN_EXTRACTED_SIZE = 250
MIN_EXTRACTED_COMM_SIZE = 1
MIN_OUTPUT_SIZE = 1
MIN_OUTPUT_COMM_SIZE = 1


# CLI file processing only, set to 0 to disable
EXTRACTION_TIMEOUT = 30


# Deduplication
MIN_DUPLCHECK_SIZE = 100
MAX_REPETITIONS = 2


# Extraction option for Htmldate
EXTENSIVE_DATE_SEARCH = on


# URLs in feeds and sitemaps
EXTERNAL_URLS = off
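These packaged defaults back the DEFAULT_CONFIG object used as a fallback in buffered_response_downloads above. A short sketch of reading a few of them; configparser stores values as strings, so the typed getters do the conversion, and the expected outputs assume the installed file matches this diff:

# Sketch: query the packaged defaults; the "DEFAULT" section name is an
# assumption, since the section header is not shown in this diff.
from trafilatura.settings import DEFAULT_CONFIG

print(DEFAULT_CONFIG.getint("DEFAULT", "DOWNLOAD_TIMEOUT"))           # 30
print(DEFAULT_CONFIG.getfloat("DEFAULT", "SLEEP_TIME"))               # 5.0
print(DEFAULT_CONFIG.getboolean("DEFAULT", "EXTENSIVE_DATE_SEARCH"))  # True ("on")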
