diff --git a/src/crawlee/_browserforge_workaround.py b/src/crawlee/_browserforge_workaround.py index d191e9f658..e165c49b4c 100644 --- a/src/crawlee/_browserforge_workaround.py +++ b/src/crawlee/_browserforge_workaround.py @@ -9,21 +9,26 @@ def patch_browserforge() -> None: from typing import Dict import apify_fingerprint_datapoints # type:ignore[import-untyped] from browserforge import download - from browserforge.download import DATA_FILES - # Needed to be done before the import of code that does import time download download.DATA_DIRS: Dict[str, Path] = { # type:ignore[misc] 'headers': apify_fingerprint_datapoints.get_header_network().parent, 'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent, } + + def DownloadIfNotExists(**flags: bool) -> None: + pass + + download.DownloadIfNotExists = DownloadIfNotExists + import browserforge.bayesian_network class BayesianNetwork(browserforge.bayesian_network.BayesianNetwork): def __init__(self, path: Path) -> None: - if path.name in DATA_FILES['headers']: - path = download.DATA_DIRS['headers'] / path.name + """Inverted mapping as browserforge expects somewhat renamed file names.""" + if path.name in download.DATA_FILES['headers']: + path = download.DATA_DIRS['headers'] / download.DATA_FILES['headers'][path.name] else: - path = download.DATA_DIRS['fingerprints'] / path.name + path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name] super().__init__(path) browserforge.bayesian_network.BayesianNetwork = BayesianNetwork # type:ignore[misc] diff --git a/src/crawlee/fingerprint_suite/_browserforge_adapter.py b/src/crawlee/fingerprint_suite/_browserforge_adapter.py index aba4028e5f..7dfdc799ac 100644 --- a/src/crawlee/fingerprint_suite/_browserforge_adapter.py +++ b/src/crawlee/fingerprint_suite/_browserforge_adapter.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os.path from collections.abc import Iterable from copy import deepcopy from functools import reduce @@ -252,7 +253,11 @@ def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, def get_available_header_network() -> dict: """Get header network that contains possible header values.""" - return extract_json(DATA_DIR / 'header-network.zip') + if os.path.isfile(DATA_DIR / 'header-network.zip'): + return extract_json(DATA_DIR / 'header-network.zip') + if os.path.isfile(DATA_DIR / 'header-network-definition.zip'): + return extract_json(DATA_DIR / 'header-network-definition.zip') + raise FileNotFoundError('Missing header-network file.') def get_available_header_values(header_network: dict, node_name: str | set[str]) -> set[str]: