diff --git a/Makefile b/Makefile index c7d9a5f45e..8a2a54397f 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ install-dev: uv sync --all-extras uv run pre-commit install uv run playwright install - uv run python -m browserforge update build: uv build --verbose diff --git a/pyproject.toml b/pyproject.toml index 1859e24a58..c8beb470ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ keywords = [ "scraping", ] dependencies = [ + "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3", "cachetools>=5.5.0", "colorama>=0.4.0", diff --git a/src/crawlee/_browserforge_workaround.py b/src/crawlee/_browserforge_workaround.py new file mode 100644 index 0000000000..d191e9f658 --- /dev/null +++ b/src/crawlee/_browserforge_workaround.py @@ -0,0 +1,35 @@ +# ruff: noqa +def patch_browserforge() -> None: + """Patches `browserforge` to use data from `apify_fingerprint_datapoints`. + + This avoids import time or runtime file downloads.""" + + # Temporary fix until https://github.com/daijro/browserforge/pull/29 is merged + from pathlib import Path + from typing import Dict + import apify_fingerprint_datapoints # type:ignore[import-untyped] + from browserforge import download + from browserforge.download import DATA_FILES + + # Needed to be done before the import of code that does import time download + download.DATA_DIRS: Dict[str, Path] = { # type:ignore[misc] + 'headers': apify_fingerprint_datapoints.get_header_network().parent, + 'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent, + } + import browserforge.bayesian_network + + class BayesianNetwork(browserforge.bayesian_network.BayesianNetwork): + def __init__(self, path: Path) -> None: + if path.name in DATA_FILES['headers']: + path = download.DATA_DIRS['headers'] / path.name + else: + path = download.DATA_DIRS['fingerprints'] / path.name + super().__init__(path) + + browserforge.bayesian_network.BayesianNetwork = BayesianNetwork # type:ignore[misc] + import 
browserforge.headers.generator + + browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers'] + import browserforge.fingerprints.generator + + browserforge.fingerprints.generator.DATA_DIR = download.DATA_DIRS['fingerprints'] diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py index d6c32ca9e5..ab4d2fa5a9 100644 --- a/src/crawlee/browsers/__init__.py +++ b/src/crawlee/browsers/__init__.py @@ -3,6 +3,12 @@ _install_import_hook(__name__) +# Due to patch_browserforge +# ruff: noqa +from .._browserforge_workaround import patch_browserforge + +patch_browserforge() + # The following imports are wrapped in try_import to handle optional dependencies, # ensuring the module can still function even if these dependencies are missing. with _try_import(__name__, 'BrowserPool'): diff --git a/src/crawlee/fingerprint_suite/__init__.py b/src/crawlee/fingerprint_suite/__init__.py index 4b604b1ff5..83251b7151 100644 --- a/src/crawlee/fingerprint_suite/__init__.py +++ b/src/crawlee/fingerprint_suite/__init__.py @@ -1,3 +1,9 @@ +# Due to patch_browserforge +# ruff: noqa +from .._browserforge_workaround import patch_browserforge + +patch_browserforge() + from ._browserforge_adapter import BrowserforgeFingerprintGenerator as DefaultFingerprintGenerator from ._fingerprint_generator import FingerprintGenerator from ._header_generator import HeaderGenerator diff --git a/uv.lock b/uv.lock index aa271d4342..6d0f3f4453 100644 --- a/uv.lock +++ b/uv.lock @@ -40,6 +40,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 }, ] +[[package]] +name = "apify-fingerprint-datapoints" +version = "0.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/94/d8/315b56dcca35a1b434e8a18bc1f4694c3ddc6fc8d7f8c19a3c983db15b9f/apify_fingerprint_datapoints-0.0.2.tar.gz", hash = "sha256:96e6f103774c7b64bb91c6a6a98315cdec340e154636b048824d3bd53401116f", size = 1145046 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/29/0bc61fa8d357bd5f0f14c7250f19c984c3ed402575e8e2283ff58ba851a6/apify_fingerprint_datapoints-0.0.2-py3-none-any.whl", hash = "sha256:fc0a3707353d98064ac51541880290dc2dbbd2898affcf770d6762836812b9c1", size = 836233 }, +] + [[package]] name = "arrow" version = "1.3.0" @@ -581,6 +590,7 @@ name = "crawlee" version = "0.6.3" source = { editable = "." } dependencies = [ + { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, { name = "cachetools" }, { name = "colorama" }, @@ -666,6 +676,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "apify-fingerprint-datapoints", specifier = ">=0.0.2" }, { name = "beautifulsoup4", extras = ["lxml"], marker = "extra == 'all'", specifier = ">=4.12.0" }, { name = "beautifulsoup4", extras = ["lxml"], marker = "extra == 'beautifulsoup'", specifier = ">=4.12.0" }, { name = "browserforge", specifier = ">=1.2.3" }, @@ -1113,7 +1124,7 @@ name = "importlib-metadata" version = "8.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp" }, + { name = "zipp", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/33/08/c1395a292bb23fd03bdf572a1357c5a733d3eecbab877641ceacab23db6e/importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580", size = 55767 } wheels = [