Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/crawlee/_browserforge_workaround.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,26 @@ def patch_browserforge() -> None:
from typing import Dict
import apify_fingerprint_datapoints # type:ignore[import-untyped]
from browserforge import download
from browserforge.download import DATA_FILES

# Needed to be done before the import of code that does import time download
download.DATA_DIRS: Dict[str, Path] = { # type:ignore[misc]
'headers': apify_fingerprint_datapoints.get_header_network().parent,
'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
}

def DownloadIfNotExists(**flags: bool) -> None:
    """Do nothing; stand-in assigned to `download.DownloadIfNotExists`.

    Args:
        **flags: Boolean keyword flags. They are accepted so any call with
            keyword arguments succeeds, and are otherwise ignored.
    """
    # Intentionally empty: the replacement must be callable but perform no work.
    return None

download.DownloadIfNotExists = DownloadIfNotExists

import browserforge.bayesian_network

class BayesianNetwork(browserforge.bayesian_network.BayesianNetwork):
    """Subclass that resolves its data file through `download.DATA_DIRS` and `download.DATA_FILES`."""

    def __init__(self, path: Path) -> None:
        """Redirect `path` to the mapped data file, then delegate to the parent initializer.

        Inverted mapping as browserforge expects somewhat renamed file names:
        only `path.name` is used, as a key into `download.DATA_FILES`, and the
        mapped file name is joined onto the matching `download.DATA_DIRS` entry.

        Args:
            path: Path whose file name selects the data file to load.
        """
        file_name = path.name
        if file_name in download.DATA_FILES['headers']:
            path = download.DATA_DIRS['headers'] / download.DATA_FILES['headers'][file_name]
        else:
            # Every name that is not a known headers file is looked up as a fingerprints file.
            path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][file_name]
        super().__init__(path)

browserforge.bayesian_network.BayesianNetwork = BayesianNetwork # type:ignore[misc]
Expand Down
7 changes: 6 additions & 1 deletion src/crawlee/fingerprint_suite/_browserforge_adapter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import os.path
from collections.abc import Iterable
from copy import deepcopy
from functools import reduce
Expand Down Expand Up @@ -252,7 +253,11 @@ def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str,

def get_available_header_network() -> dict:
    """Get header network that contains possible header values.

    The archive is looked up in `DATA_DIR` under each known file name, in
    order, and the first one that exists is extracted.

    Returns:
        The result of `extract_json` for the first archive found.

    Raises:
        FileNotFoundError: If none of the known archive names exists in `DATA_DIR`.
    """
    # Order matters: 'header-network.zip' takes precedence when both are present.
    for archive_name in ('header-network.zip', 'header-network-definition.zip'):
        archive_path = DATA_DIR / archive_name
        if os.path.isfile(archive_path):
            return extract_json(archive_path)
    raise FileNotFoundError('Missing header-network file.')


def get_available_header_values(header_network: dict, node_name: str | set[str]) -> set[str]:
Expand Down
Loading