Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ install-dev:
uv sync --all-extras
uv run pre-commit install
uv run playwright install
uv run python -m browserforge update

build:
uv build --verbose
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ keywords = [
"scraping",
]
dependencies = [
"apify_fingerprint_datapoints>=0.0.2",
"browserforge>=1.2.3",
"cachetools>=5.5.0",
"colorama>=0.4.0",
Expand Down
35 changes: 35 additions & 0 deletions src/crawlee/_browserforge_workaround.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ruff: noqa
def patch_browserforge() -> None:
    """Patches `browserforge` to use data from `apify_fingerprint_datapoints`.

    This avoids import time or runtime file downloads."""

    # Temporary fix until https://github.com/daijro/browserforge/pull/29 is merged
    from pathlib import Path
    from typing import Dict

    import apify_fingerprint_datapoints  # type:ignore[import-untyped]
    from browserforge import download
    from browserforge.download import DATA_FILES

    # Point browserforge at the locally packaged data files. This must be done
    # before importing any browserforge submodule that downloads at import time.
    download.DATA_DIRS: Dict[str, Path] = {  # type:ignore[misc]
        'headers': apify_fingerprint_datapoints.get_header_network().parent,
        'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
    }
    import browserforge.bayesian_network

    class BayesianNetwork(browserforge.bayesian_network.BayesianNetwork):
        """Resolves network definition files against the patched data directories."""

        def __init__(self, path: Path) -> None:
            # Header network files live in the headers dir; everything else is
            # resolved against the fingerprints dir.
            if path.name in DATA_FILES['headers']:
                path = download.DATA_DIRS['headers'] / path.name
            else:
                path = download.DATA_DIRS['fingerprints'] / path.name
            super().__init__(path)

    browserforge.bayesian_network.BayesianNetwork = BayesianNetwork  # type:ignore[misc]
    import browserforge.headers.generator

    browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']
    import browserforge.fingerprints.generator

    # Bug fix: the original assigned `browserforge.headers.generator.DATA_DIR`
    # here a second time, clobbering the headers dir with the fingerprints dir
    # and leaving the fingerprints generator's DATA_DIR unpatched.
    browserforge.fingerprints.generator.DATA_DIR = download.DATA_DIRS['fingerprints']
6 changes: 6 additions & 0 deletions src/crawlee/browsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@

_install_import_hook(__name__)

# Apply the browserforge workaround before any browserforge-dependent import
# below, so that no import-time or runtime file download is triggered.
# ruff: noqa
from .._browserforge_workaround import patch_browserforge

patch_browserforge()

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'BrowserPool'):
Expand Down
6 changes: 6 additions & 0 deletions src/crawlee/fingerprint_suite/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# Apply the browserforge workaround before importing `._browserforge_adapter`
# below, so that no import-time or runtime file download is triggered.
# ruff: noqa
from .._browserforge_workaround import patch_browserforge

patch_browserforge()

from ._browserforge_adapter import BrowserforgeFingerprintGenerator as DefaultFingerprintGenerator
from ._fingerprint_generator import FingerprintGenerator
from ._header_generator import HeaderGenerator
Expand Down
13 changes: 12 additions & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading