diff --git a/src/zimscraperlib/constants.py b/src/zimscraperlib/constants.py index cc2946c2..3533c615 100644 --- a/src/zimscraperlib/constants.py +++ b/src/zimscraperlib/constants.py @@ -11,6 +11,7 @@ NAME = pathlib.Path(__file__).parent.name SCRAPER = f"{NAME} {__version__}" CONTACT = "dev@openzim.org" +DEFAULT_USER_AGENT = f"{NAME}/{__version__} ({CONTACT})" UTF8 = "UTF-8" diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py index 2497f3f6..058ffc76 100644 --- a/src/zimscraperlib/inputs.py +++ b/src/zimscraperlib/inputs.py @@ -7,18 +7,13 @@ from typing import Optional, Tuple, Union from zimscraperlib import logger -from zimscraperlib.constants import ( - CONTACT, -) +from zimscraperlib.constants import DEFAULT_USER_AGENT from zimscraperlib.constants import ( MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, ) from zimscraperlib.constants import ( MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, ) -from zimscraperlib.constants import ( - SCRAPER as PROJECT_NAME, -) from zimscraperlib.download import stream_file @@ -27,6 +22,7 @@ def handle_user_provided_file( dest: Optional[pathlib.Path] = None, in_dir: Optional[pathlib.Path] = None, nocopy: bool = False, # noqa: FBT001, FBT002 + user_agent: Optional[str] = DEFAULT_USER_AGENT, ) -> Union[pathlib.Path, None]: """path to downloaded or copied a user provided file (URL or path) @@ -48,7 +44,7 @@ def handle_user_provided_file( if str(source).startswith("http"): logger.debug(f"download {source} -> {dest}") - headers = {"User-Agent": f"{PROJECT_NAME.replace(' ','/')} ({CONTACT})"} + headers = {"User-Agent": user_agent} if user_agent else None stream_file(url=str(source), fpath=dest, headers=headers) else: source = pathlib.Path(source).expanduser().resolve() diff --git a/tests/conftest.py b/tests/conftest.py index bee8e890..011e15e2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,6 +120,16 @@ def webp_image(): return file_src("ninja.webp") +@pytest.fixture(scope="module") +def valid_user_agent(): + return "name/version (contact)" + + +@pytest.fixture(scope="module") +def invalid_user_agent(): + return "name version) (contact)" + + @pytest.fixture(scope="session") def small_zim_file(tmpdir_factory): from zimscraperlib.download import stream_file diff --git a/tests/inputs/test_inputs.py b/tests/inputs/test_inputs.py index 9377ca4c..af23d904 100644 --- a/tests/inputs/test_inputs.py +++ b/tests/inputs/test_inputs.py @@ -6,12 +6,17 @@ import pytest +import zimscraperlib +from zimscraperlib.constants import CONTACT from zimscraperlib.constants import ( MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, ) from zimscraperlib.constants import ( MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, ) +from zimscraperlib.constants import ( + NAME as PROJECT_NAME, +) from zimscraperlib.inputs import compute_descriptions, handle_user_provided_file @@ -80,6 +85,37 @@ def test_remote_indir(tmp_path, valid_http_url): assert fpath.parent == tmp_path +def test_remote_default_user_agent(valid_http_url, monkeypatch): + def mock_stream_file(**kwargs): + headers = kwargs.get("headers") + assert headers is not None + user_agent = headers.get("User-Agent") + assert isinstance(user_agent, str) + assert user_agent.startswith(PROJECT_NAME) + assert user_agent.endswith(f"({CONTACT})") + + monkeypatch.setattr( + zimscraperlib.inputs, # pyright: ignore[reportAttributeAccessIssue] + "stream_file", + mock_stream_file, + raising=True, + ) + handle_user_provided_file(source=valid_http_url) + + +def test_remote_provided_none_user_agent(valid_http_url, monkeypatch): + def mock_stream_file(**kwargs): + assert kwargs.get("headers") is None + + monkeypatch.setattr( + zimscraperlib.inputs, # pyright: ignore[reportAttributeAccessIssue] + "stream_file", + mock_stream_file, + raising=True, + ) + handle_user_provided_file(source=valid_http_url, user_agent=None) + + TEXT_NOT_USED = "text not used" LONG_TEXT = (