diff --git a/docs/guides/avoid_blocking.mdx b/docs/guides/avoid_blocking.mdx index c0f6838a2e..daccf1c4d8 100644 --- a/docs/guides/avoid_blocking.mdx +++ b/docs/guides/avoid_blocking.mdx @@ -17,7 +17,7 @@ Browser fingerprint is a collection of browser attributes and significant featur ## Using browser fingerprints -Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints can be enabled in `PlaywrightCrawler` by using the `fingerprint_generator` argument of the `PlaywrightCrawler.__init__`. You can either pass your own implementation of `FingerprintGenerator` or use `DefaultFingerprintGenerator`. +Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints in `PlaywrightCrawler` is enabled by default. You can customize the fingerprints by using the `fingerprint_generator` argument of the `PlaywrightCrawler.__init__`: either pass your own implementation of `FingerprintGenerator` or use `DefaultFingerprintGenerator`. {PlaywrightDefaultFingerprintGenerator} @@ -29,7 +29,7 @@ In certain cases we want to narrow down the fingerprints used - e.g. specify a c {PlaywrightDefaultFingerprintGeneratorWithArgs} -If you do not want to use fingerprints, then do not pass `fingerprint_generator` argument to the `PlaywrightCrawler.__init__`. By default, fingerprints are disabled. +If you do not want to use fingerprints, pass `fingerprint_generator=None` to the `PlaywrightCrawler.__init__`. 
## Using Camoufox diff --git a/docs/guides/code_examples/browser_fingerprint/playwright_with_fingerprint_generator.py b/docs/guides/code_examples/browser_fingerprint/playwright_with_fingerprint_generator.py index ce6eef64d3..5e1c8d2668 100644 --- a/docs/guides/code_examples/browser_fingerprint/playwright_with_fingerprint_generator.py +++ b/docs/guides/code_examples/browser_fingerprint/playwright_with_fingerprint_generator.py @@ -1,14 +1,11 @@ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext -from crawlee.fingerprint_suite import DefaultFingerprintGenerator async def main() -> None: - crawler = PlaywrightCrawler( - # Fingerprint generator to be used. By default no fingerprint generation is done. - fingerprint_generator=DefaultFingerprintGenerator(), - ) + # Fingerprint generator is used by default. + crawler = PlaywrightCrawler() # Define the default request handler, which will be called for every request. @crawler.router.default_handler diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index bdc6402157..4b5ffba9dc 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -2,7 +2,7 @@ import logging from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Generic +from typing import TYPE_CHECKING, Any, Callable, Generic, Literal from pydantic import ValidationError from typing_extensions import NotRequired, TypedDict, TypeVar @@ -14,6 +14,7 @@ from crawlee.browsers import BrowserPool from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.errors import SessionError +from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions from crawlee.sessions._cookies import PlaywrightCookieParam from crawlee.statistics import StatisticsState @@ -34,7 +35,6 @@ from crawlee 
import RequestTransformAction from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs from crawlee.browsers._types import BrowserType - from crawlee.fingerprint_suite import FingerprintGenerator @docs_group('Classes') @@ -86,7 +86,7 @@ def __init__( user_data_dir: str | Path | None = None, browser_launch_options: Mapping[str, Any] | None = None, browser_new_context_options: Mapping[str, Any] | None = None, - fingerprint_generator: FingerprintGenerator | None = None, + fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default', headless: bool | None = None, use_incognito_pages: bool | None = None, **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]], @@ -119,7 +119,7 @@ def __init__( if browser_pool: # Raise an exception if browser_pool is provided together with other browser-related arguments. if any( - param is not None + param not in [None, 'default'] for param in ( user_data_dir, use_incognito_pages, @@ -138,6 +138,12 @@ def __init__( # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments. 
else: + if fingerprint_generator == 'default': + generator_browser_type = None if browser_type is None else [browser_type] + fingerprint_generator = DefaultFingerprintGenerator( + header_options=HeaderGeneratorOptions(browsers=generator_browser_type) + ) + browser_pool = BrowserPool.with_default_plugin( headless=headless, browser_type=browser_type, diff --git a/src/crawlee/fingerprint_suite/_browserforge_adapter.py b/src/crawlee/fingerprint_suite/_browserforge_adapter.py index 0c36362261..aba4028e5f 100644 --- a/src/crawlee/fingerprint_suite/_browserforge_adapter.py +++ b/src/crawlee/fingerprint_suite/_browserforge_adapter.py @@ -1,13 +1,17 @@ from __future__ import annotations +from collections.abc import Iterable from copy import deepcopy -from typing import TYPE_CHECKING, Any +from functools import reduce +from operator import or_ +from typing import TYPE_CHECKING, Any, Literal from browserforge.bayesian_network import extract_json from browserforge.fingerprints import Fingerprint as bf_Fingerprint from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator from browserforge.fingerprints import Screen -from browserforge.headers.generator import DATA_DIR +from browserforge.headers import Browser +from browserforge.headers.generator import DATA_DIR, ListOrString from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator from typing_extensions import override @@ -21,7 +25,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator): - """Browserforge `HeaderGenerator` that contains patches not accepted in upstream repo.""" + """Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator.""" def _get_accept_language_header(self, locales: tuple[str, ...]) -> str: """Generates the Accept-Language header based on the given locales. 
@@ -38,6 +42,117 @@ def _get_accept_language_header(self, locales: tuple[str, ...]) -> str:
         additional_locales = [f'{locale};q={0.9 - index * 0.1:.1f}' for index, locale in enumerate(locales[1:])]
         return ','.join((locales[0], *additional_locales))
 
+    def generate(
+        self,
+        *,
+        browser: Iterable[str | Browser] | None = None,
+        os: ListOrString | None = None,
+        device: ListOrString | None = None,
+        locale: ListOrString | None = None,
+        http_version: Literal[1, 2] | None = None,
+        user_agent: ListOrString | None = None,
+        strict: bool | None = None,
+        request_dependent_headers: dict[str, str] | None = None,
+    ) -> dict[str, str]:
+        """Generate HTTP headers based on the specified parameters.
+
+        For detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`
+        This patched version of the method adds additional quality checks on the output of the original method. It tries
+        to generate headers several times until they match the requirements.
+
+        The `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome
+        but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium`
+        input, such as:
+        ```
+        Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)
+        CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1
+        ```
+        To maintain consistency with previous implementations, only a subset of Chromium headers will be allowed.
+
+        Returns:
+            The generated headers.
+
+        Raises:
+            RuntimeError: If no acceptable headers were generated within the attempt limit.
+        """
+        # browserforge header generation can be flaky. Enforce basic QA on generated headers
+        max_attempts = 10
+
+        single_browser = self._get_single_browser_type(browser)
+
+        if single_browser == 'chromium':
+            # `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also
+            # other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers
+            # that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers.
+            # Increase max attempts as from `BrowserForge` header generator perspective even `chromium`
+            # headers without `sec-...` headers are valid.
+            max_attempts += 50
+
+        # Browserforge uses term 'safari', we use term 'webkit'
+        bf_browser_type = 'safari' if single_browser == 'webkit' else single_browser
+
+        # Use browserforge to generate headers until it satisfies our additional requirements.
+        for _attempt in range(max_attempts):
+            generated_header: dict[str, str] = super().generate(
+                browser=bf_browser_type,
+                os=os,
+                device=device,
+                locale=locale,
+                http_version=http_version,
+                user_agent=user_agent,
+                strict=strict,
+                request_dependent_headers=request_dependent_headers,
+            )
+
+            if ('headless' in generated_header.get('User-Agent', '').lower()) or (
+                'headless' in generated_header.get('sec-ch-ua', '').lower()
+            ):
+                # It can be a valid header, but we never want to leak "headless". Get a different one.
+                continue
+
+            if any(
+                keyword in generated_header.get('User-Agent', '')
+                for keyword in self._get_expected_browser_keywords(single_browser)
+            ):
+                if single_browser == 'chromium' and not self._contains_all_sec_headers(generated_header):
+                    # Accept chromium header only with all sec headers.
+                    continue
+
+                return generated_header
+        raise RuntimeError('Failed to generate header.')
+
+    def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool:
+        return all(header_name in headers for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'))
+
+    def _get_expected_browser_keywords(self, browser: str | None) -> set[str]:
+        if not browser:
+            # Allow all possible keywords when there is no preference for specific browser type.
+            return reduce(or_, BROWSER_TYPE_HEADER_KEYWORD.values())
+
+        return BROWSER_TYPE_HEADER_KEYWORD[browser]
+
+    def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> str | None:
+        """Get single browser type.
+
+        Browserforge header generator accepts wider range of possible types.
+        Narrow it to single optional string as that is how we use it.
+        Handling the original multitype would be pointlessly complex.
+        """
+        # We never pass more than one browser type; more browsers are just a bigger pool to select from, so taking the
+        # first one is valid. An empty iterable yields `None` (no preference) instead of raising `StopIteration`.
+        first_browser = (
+            next(iter(browser), None) if (isinstance(browser, Iterable) and not isinstance(browser, str)) else browser
+        )
+
+        if isinstance(first_browser, str):
+            single_name = first_browser
+        elif isinstance(first_browser, Browser):
+            single_name = first_browser.name
+        else:
+            single_name = None
+
+        return single_name
+
 
 class PatchedFingerprintGenerator(bf_FingerprintGenerator):
     """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""
@@ -91,8 +206,6 @@ def __init__(
             screen_options: Defines the screen constrains for the fingerprint generator.
             mock_web_rtc: Whether to mock WebRTC when injecting the fingerprint.
             slim: Disables performance-heavy evasions when injecting the fingerprint.
-            strict: If set to `True`, it will raise error if it is not possible to generate fingerprints based on the
-                `options`. Default behavior is relaxation of `options` until it is possible to generate a fingerprint.
         """
         bf_options: dict[str, Any] = {'mock_webrtc': mock_web_rtc, 'slim': slim}
 
@@ -136,38 +249,8 @@ def __init__(self) -> None:
         self._generator = PatchedHeaderGenerator(locale=['en-US', 'en'])
 
     def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, str]:
-        """Generate headers.
-
-        browser_type = `chromium` is in general sense not just Google Chrome, but also other chromium based browsers.
- For example this Safari user agent can be generated for `chromium` input: - `Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) - CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1` - To remain consistent with previous implementation only subset of `chromium` header will be allowed. - """ - # browserforge header generation can be flaky. Enforce basic QA on generated headers - max_attempts = 10 - - if browser_type == 'chromium': - # `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also - # other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers - # that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers. - # Increase max attempts as from `BrowserForge` header generator perspective even `chromium` - # headers without `sec-...` headers are valid. - max_attempts += 50 - - bf_browser_type = 'safari' if browser_type == 'webkit' else browser_type - - for _attempt in range(max_attempts): - generated_header: dict[str, str] = self._generator.generate(browser=bf_browser_type) - if any(keyword in generated_header['User-Agent'] for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]): - if browser_type == 'chromium' and not self._contains_all_sec_headers(generated_header): - continue - - return generated_header - raise RuntimeError('Failed to generate header.') - - def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool: - return all(header_name in headers for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform')) + """Generate headers.""" + return self._generator.generate(browser=browser_type) def get_available_header_network() -> dict: diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 07e5804032..8a3bde6234 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ 
b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -5,7 +5,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal from unittest import mock from unittest.mock import Mock @@ -15,6 +15,7 @@ from crawlee.crawlers import PlaywrightCrawler from crawlee.fingerprint_suite import ( DefaultFingerprintGenerator, + FingerprintGenerator, HeaderGeneratorOptions, ScreenOptions, ) @@ -145,9 +146,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert handled_urls == set() -async def test_chromium_headless_headers(header_network: dict) -> None: +@pytest.mark.parametrize( + 'fingerprint_generator', + [ + pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'), + pytest.param( + DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chromium'])), + id='Explicitly passed fingerprint generator.', + ), + pytest.param('default', id='Default fingerprint generator.'), + ], +) +async def test_chromium_headless_headers( + header_network: dict, fingerprint_generator: None | FingerprintGenerator | Literal['default'] +) -> None: browser_type: BrowserType = 'chromium' - crawler = PlaywrightCrawler(headless=True, browser_type=browser_type) + crawler = PlaywrightCrawler(headless=True, browser_type=browser_type, fingerprint_generator=fingerprint_generator) headers = dict[str, str]() @crawler.pre_navigation_hook @@ -169,7 +183,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: user_agent = headers.get('user-agent') assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}) - assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]) + assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]), user_agent assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua') 
assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')