From 0304438a46b0b81904a24bac8d076e558413bb15 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 10 Mar 2025 09:00:02 +0100 Subject: [PATCH 1/2] Use apify_fingerprint_datapoints package instead of file downloads --- browserforge/download.py | 134 +++------------------ browserforge/fingerprints/data/__init__.py | 0 browserforge/fingerprints/generator.py | 4 +- browserforge/headers/data/__init__.py | 0 browserforge/headers/generator.py | 14 +-- pyproject.toml | 1 + 6 files changed, 27 insertions(+), 126 deletions(-) delete mode 100644 browserforge/fingerprints/data/__init__.py delete mode 100644 browserforge/headers/data/__init__.py diff --git a/browserforge/download.py b/browserforge/download.py index 22f2ab1..71087d6 100644 --- a/browserforge/download.py +++ b/browserforge/download.py @@ -1,102 +1,11 @@ -import shutil -import urllib.request -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime, timedelta -from pathlib import Path -from typing import Dict, Iterator import click """ -Downloads the required model definitions +Downloads the required model definitions - deprecated """ -ROOT_DIR: Path = Path(__file__).parent - -"""Constants for headers and fingerprints data""" -DATA_DIRS: Dict[str, Path] = { - "headers": ROOT_DIR / "headers/data", - "fingerprints": ROOT_DIR / "fingerprints/data", -} -DATA_FILES: Dict[str, Dict[str, str]] = { - "headers": { - "browser-helper-file.json": "browser-helper-file.json", - "header-network.zip": "header-network-definition.zip", - "headers-order.json": "headers-order.json", - "input-network.zip": "input-network-definition.zip", - }, - "fingerprints": { - "fingerprint-network.zip": "fingerprint-network-definition.zip", - }, -} -REMOTE_PATHS: Dict[str, str] = { - "headers": "https://github.com/apify/fingerprint-suite/raw/master/packages/header-generator/src/data_files", - "fingerprints": "https://github.com/apify/fingerprint-suite/raw/master/packages/fingerprint-generator/src/data_files", -} - - -class DownloadException(Exception): - """Raises when the download fails.""" - - -class DataDownloader: - """ - Download and extract data files for both headers and fingerprints. - """ - - def __init__(self, **kwargs: bool) -> None: - self.options = _enabled_flags(kwargs) - - def download_file(self, url: str, path: str) -> None: - """ - Download a file from the specified URL and save it to the given path. - """ - with urllib.request.urlopen(url) as resp: # nosec - if resp.status != 200: - raise DownloadException(f"Download failed with status code: {resp.status}") - with open(path, "wb") as f: - shutil.copyfileobj(resp, f) - - def download(self) -> None: - """ - Download and extract data files for both headers and fingerprints. - """ - futures = {} - with ThreadPoolExecutor(10) as executor: - for data_type in self.options: - for local_name, remote_name in DATA_FILES[data_type].items(): - url = f"{REMOTE_PATHS[data_type]}/{remote_name}" - path = str(DATA_DIRS[data_type] / local_name) - future = executor.submit(self.download_file, url, path) - futures[future] = local_name - for f in as_completed(futures): - try: - future.result() - click.secho(f"{futures[f]:<30}OK!", fg="green") - except Exception as e: - click.secho(f"Error downloading {local_name}: {e}", fg="red") - - -def _enabled_flags(flags: Dict[str, bool]) -> Iterator[str]: - """ - Returns a list of enabled flags based on a given dictionary - """ - for flag, enabled in flags.items(): - if enabled: - yield flag - - -def _get_all_paths(**flags: bool) -> Iterator[Path]: - """ - Yields all the paths to the downloaded data files - """ - for data_type in _enabled_flags(flags): - data_path = DATA_DIRS[data_type] - for local_name, _ in DATA_FILES[data_type].items(): - yield data_path / local_name - - """ Public download functions """ @@ -104,44 +13,35 @@ def _get_all_paths(**flags: bool) -> Iterator[Path]: def Download(headers=False, fingerprints=False) -> None: """ - Download the required data files + Deprecated. Downloading model definition files is no longer needed. + + Files are included as explicit python package dependency. """ - # Announce that files are being downloaded - click.secho('Downloading model definition files...', fg='bright_yellow') - try: - DataDownloader(headers=headers, fingerprints=fingerprints).download() - except KeyboardInterrupt: - print("Download interrupted.") - Remove() - exit() + click.secho('Deprecated. Downloading model definition files is no longer needed.', fg='bright_yellow') def DownloadIfNotExists(**flags: bool) -> None: """ - Download the required data files if they don't exist + Deprecated. Downloading model definition files is no longer needed. + + Files are included as explicit python package dependency. """ - if not IsDownloaded(**flags): - Download(**flags) + pass def IsDownloaded(**flags: bool) -> bool: """ - Check if the required data files are already downloaded and not older than a month. - Returns True if all the requested data files are present and not older than a month, False otherwise. - """ - for path in _get_all_paths(**flags): - if not path.exists(): - return False + Deprecated. Downloading model definition files is no longer needed. - # Check if the file is older than a month - file_creation_time = datetime.fromtimestamp(path.stat().st_ctime) - one_month_ago = datetime.now() - timedelta(weeks=5) - return file_creation_time >= one_month_ago + Files are included as explicit python package dependency. + """ + return True def Remove() -> None: """ - Deletes all downloaded data files + Deprecated. Downloading model definition files is no longer needed. + + Files are included as explicit python package dependency. """ - for path in _get_all_paths(headers=True, fingerprints=True): - path.unlink(missing_ok=True) + pass diff --git a/browserforge/fingerprints/data/__init__.py b/browserforge/fingerprints/data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/browserforge/fingerprints/generator.py b/browserforge/fingerprints/generator.py index 052a17c..150c663 100644 --- a/browserforge/fingerprints/generator.py +++ b/browserforge/fingerprints/generator.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Dict, List, Optional +from apify_fingerprint_datapoints import get_fingerprint_network + from browserforge.bayesian_network import BayesianNetwork, get_possible_values from browserforge.headers import HeaderGenerator from browserforge.headers.utils import get_user_agent @@ -128,7 +130,7 @@ def is_set(self) -> bool: class FingerprintGenerator: """Generates realistic browser fingerprints""" - fingerprint_generator_network = BayesianNetwork(DATA_DIR / "fingerprint-network.zip") + fingerprint_generator_network = BayesianNetwork(get_fingerprint_network()) def __init__( self, diff --git a/browserforge/headers/data/__init__.py b/browserforge/headers/data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/browserforge/headers/generator.py b/browserforge/headers/generator.py index 876ee16..05883d1 100644 --- a/browserforge/headers/generator.py +++ b/browserforge/headers/generator.py @@ -1,8 +1,9 @@ from dataclasses import dataclass -from pathlib import Path from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union from browserforge.bayesian_network import BayesianNetwork, get_possible_values +from apify_fingerprint_datapoints import (get_header_network, get_headers_order, get_browser_helper_file, + get_input_network) from .utils import get_browser, get_user_agent, pascalize_headers, tuplify @@ -35,7 +36,6 @@ 'sec-fetch-site': '?1', 'sec-fetch-user': 'document', } -DATA_DIR: Path = Path(__file__).parent / 'data' ListOrString: TypeAlias = Union[Tuple[str, ...], List[str], str] @@ -83,8 +83,8 @@ class HeaderGenerator: relaxation_order: Tuple[str, ...] = ('locales', 'devices', 'operatingSystems', 'browsers') # Initialize networks - input_generator_network = BayesianNetwork(DATA_DIR / "input-network.zip") - header_generator_network = BayesianNetwork(DATA_DIR / "header-network.zip") + input_generator_network = BayesianNetwork(get_input_network()) + header_generator_network = BayesianNetwork(get_header_network()) def __init__( self, @@ -433,8 +433,7 @@ def _load_headers_order(self) -> Dict[str, List[str]]: Returns: Dict[str, List[str]]: Dictionary of headers order for each browser. """ - headers_order_path = DATA_DIR / "headers-order.json" - return json.loads(headers_order_path.read_bytes()) + return json.loads(get_headers_order().read_bytes()) def _load_unique_browsers(self) -> List[HttpBrowserObject]: """ @@ -443,8 +442,7 @@ def _load_unique_browsers(self) -> List[HttpBrowserObject]: Returns: List[HttpBrowserObject]: List of HttpBrowserObject instances. """ - browser_helper_path = DATA_DIR / 'browser-helper-file.json' - unique_browser_strings = json.loads(browser_helper_path.read_bytes()) + unique_browser_strings = json.loads(get_browser_helper_file().read_bytes()) return [ self._prepare_http_browser_object(browser_str) for browser_str in unique_browser_strings diff --git a/pyproject.toml b/pyproject.toml index acac848..d3b9bf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ python = "^3.8" click = "*" orjson = { version = "*", optional = true } typing_extensions = {version = "*", python = "<3.10"} +apify_fingerprint_datapoints = "*" [tool.poetry.extras] all = ["orjson"] From db16424295bb8c1d317601eff6022e178fef1603 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 10 Mar 2025 09:27:38 +0100 Subject: [PATCH 2/2] Remove files related cli. Update docs. Bump version. --- README.md | 11 ----------- browserforge/__main__.py | 39 --------------------------------------- pyproject.toml | 2 +- 3 files changed, 1 insertion(+), 51 deletions(-) delete mode 100644 browserforge/__main__.py diff --git a/README.md b/README.md index 709c65d..7d853c8 100644 --- a/README.md +++ b/README.md @@ -50,15 +50,7 @@ It is a reimplementation of [Apify's fingerprint-suite](https://github.com/apify ``` pip install browserforge[all] -python -m browserforge update ``` - -The `[all]` extra will include optional libraries like orjson. - -Use `python -m browserforge update` to fetch necessary model files. If the command is not run, files will be downloaded on the first import. - -
- ## Usage ## Generating Headers @@ -594,10 +586,7 @@ Parameters: ## Uninstall -To fully remove all files, run the following commands: - ``` -python -m browserforge remove pip uninstall browserforge ``` diff --git a/browserforge/__main__.py b/browserforge/__main__.py deleted file mode 100644 index 8429688..0000000 --- a/browserforge/__main__.py +++ /dev/null @@ -1,39 +0,0 @@ -import click - -from browserforge.download import Download, Remove - - -class DownloadException(Exception): - """Raises when the download fails.""" - - -@click.group() -def cli() -> None: - pass - - -@cli.command(name='update') -@click.option('--headers', is_flag=True, help='Only update header definitions') -@click.option('--fingerprints', is_flag=True, help='Only update fingerprint definitions') -def update(headers=False, fingerprints=False): - """ - Fetches header and fingerprint definitions - """ - # if no options passed, mark both as True - if not headers ^ fingerprints: - headers = fingerprints = True - - Download(headers=headers, fingerprints=fingerprints) - - -@cli.command(name='remove') -def remove(): - """ - Remove all downloaded files - """ - Remove() - click.secho('Removed all files!', fg='bright_yellow') - - -if __name__ == '__main__': - cli() diff --git a/pyproject.toml b/pyproject.toml index d3b9bf5..2101d42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "browserforge" -version = "1.2.3" +version = "1.2.4" description = "Intelligent browser header & fingerprint generator" authors = ["daijro "] license = "Apache-2.0"