Skip to content

feat: mask Playwright's "headless" headers #545

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/crawlee/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,12 @@ def __init__(self, headers: Mapping[str, str] | None = None) -> None:
"""
# Ensure immutability by sorting and fixing the order.
headers = headers or {}
headers = {k.lower(): v for k, v in headers.items()}
headers = {k.capitalize(): v for k, v in headers.items()}
self._headers = dict(sorted(headers.items()))

def __getitem__(self, key: str) -> str:
    """Get the value of a header by its name, case-insensitive.

    Header names are stored normalized with `str.capitalize()` by the constructor, so the requested
    name is normalized the same way before the lookup.

    Args:
        key: The header name, in any letter case.

    Returns:
        The value stored for the header.

    Raises:
        KeyError: If no header with that name is present.
    """
    # A lower-cased lookup would never match the capitalized keys; normalize exactly like `__init__`.
    return self._headers[key.capitalize()]

def __iter__(self) -> Iterator[str]:
"""Return an iterator over the header names."""
Expand All @@ -261,3 +261,13 @@ def __setitem__(self, key: str, value: str) -> None:
def __delitem__(self, key: str) -> None:
    """Reject any attempt to delete a header, because instances are immutable.

    Raises:
        TypeError: Always.
    """
    owner_name = self.__class__.__name__
    raise TypeError(f'{owner_name} is immutable')

def __or__(self, other: Mapping[str, str]) -> HttpHeaders:
    """Merge this instance with `other` (`self | other`) into a new `HttpHeaders`.

    Entries of `other` are placed after this instance's entries, so they take precedence when both
    define the same header.
    """
    return HttpHeaders({**self._headers, **other})

def __ror__(self, other: Mapping[str, str]) -> HttpHeaders:
    """Merge `other` with this instance (`other | self`) into a new `HttpHeaders`.

    This instance's entries are placed after those of `other`, so they take precedence when both
    define the same header.
    """
    return HttpHeaders({**other, **self._headers})
6 changes: 6 additions & 0 deletions src/crawlee/browsers/_base_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from playwright.async_api import Page

from crawlee.browsers._types import BrowserType
from crawlee.proxy_configuration import ProxyInfo


Expand Down Expand Up @@ -50,6 +51,11 @@ def has_free_capacity(self) -> bool:
def is_browser_connected(self) -> bool:
    """Return whether the browser is connected."""

@property
@abstractmethod
def browser_type(self) -> BrowserType:
    """Return the type of the browser, as a `BrowserType` value ('chromium', 'firefox' or 'webkit')."""

@abstractmethod
async def new_page(
self,
Expand Down
5 changes: 3 additions & 2 deletions src/crawlee/browsers/_base_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from collections.abc import Mapping
from types import TracebackType

from crawlee.browsers._base_browser_controller import BaseBrowserController
from crawlee.browsers._types import BrowserType


class BaseBrowserPlugin(ABC):
Expand All @@ -24,7 +25,7 @@ class BaseBrowserPlugin(ABC):

@property
@abstractmethod
def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']:
def browser_type(self) -> BrowserType:
"""Return the browser type name."""

@property
Expand Down
6 changes: 3 additions & 3 deletions src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from collections import defaultdict
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any
from weakref import WeakValueDictionary

from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.recurring_task import RecurringTask
from crawlee.browsers._base_browser_controller import BaseBrowserController
from crawlee.browsers._playwright_browser_plugin import PlaywrightBrowserPlugin
from crawlee.browsers._types import CrawleePage
from crawlee.browsers._types import BrowserType, CrawleePage

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
Expand Down Expand Up @@ -94,7 +94,7 @@ def with_default_plugin(
cls,
*,
headless: bool | None = None,
browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
browser_type: BrowserType | None = None,
**kwargs: Any,
) -> BrowserPool:
"""Create a new instance with a single `BaseBrowserPlugin` configured with the provided options.
Expand Down
67 changes: 54 additions & 13 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, cast

from playwright.async_api import Page
from playwright.async_api import BrowserContext, Page, ProxySettings
from typing_extensions import override

from crawlee.browsers._base_browser_controller import BaseBrowserController
from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite import HeaderGenerator

if TYPE_CHECKING:
from collections.abc import Mapping
Expand All @@ -26,17 +28,29 @@ class PlaywrightBrowserController(BaseBrowserController):
"""

AUTOMATION_LIBRARY = 'playwright'
_DEFAULT_HEADER_GENERATOR = HeaderGenerator()

def __init__(self, browser: Browser, *, max_open_pages_per_browser: int = 20) -> None:
def __init__(
self,
browser: Browser,
*,
max_open_pages_per_browser: int = 20,
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
) -> None:
"""Create a new instance.

Args:
browser: The browser instance to control.
max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
disable automatic header modifications.
"""
self._browser = browser
self._max_open_pages_per_browser = max_open_pages_per_browser
self._header_generator = header_generator

self._browser_context: BrowserContext | None = None
self._pages = list[Page]()
self._last_page_opened_at = datetime.now(timezone.utc)

Expand Down Expand Up @@ -70,26 +84,25 @@ def has_free_capacity(self) -> bool:
def is_browser_connected(self) -> bool:
return self._browser.is_connected()

@property
@override
def browser_type(self) -> BrowserType:
    # The name reported by the wrapped browser is narrowed to the `BrowserType` literal for type checkers.
    reported_name = self._browser.browser_type.name
    return cast(BrowserType, reported_name)

@override
async def new_page(
self,
page_options: Mapping[str, Any] | None = None,
proxy_info: ProxyInfo | None = None,
) -> Page:
page_options = dict(page_options) if page_options else {}

# If "proxy_info" is provided and no proxy is already set in "page_options", configure the proxy.
if proxy_info and 'proxy' not in page_options:
page_options['proxy'] = {
'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
'username': proxy_info.username,
'password': proxy_info.password,
}
if not self._browser_context:
self._browser_context = await self._create_browser_context(proxy_info)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')

page = await self._browser.new_page(**page_options)
page_options = dict(page_options) if page_options else {}
page = await self._browser_context.new_page(**page_options)

# Handle page close event
page.on(event='close', f=self._on_page_close)
Expand All @@ -114,3 +127,31 @@ async def close(self, *, force: bool = False) -> None:
def _on_page_close(self, page: Page) -> None:
"""Handle actions after a page is closed."""
self._pages.remove(page)

async def _create_browser_context(self, proxy_info: ProxyInfo | None = None) -> BrowserContext:
    """Create a new browser context with the specified proxy settings.

    When a header generator is configured, its common, Sec-Ch-Ua and User-Agent headers for this
    controller's browser type are merged and passed as the context's extra HTTP headers, and the
    User-Agent value is also passed as the context's user agent. Without a generator both are `None`.

    Args:
        proxy_info: Optional proxy whose scheme, hostname, port and credentials are applied to the context.

    Returns:
        The browser context created on the controlled browser.
    """
    generator = self._header_generator

    if not generator:
        extra_http_headers = None
        user_agent = None
    else:
        common = generator.get_common_headers()
        sec_ch_ua = generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
        ua_header = generator.get_user_agent_header(browser_type=self.browser_type)
        extra_http_headers = dict(common | sec_ch_ua | ua_header)
        user_agent = ua_header.get('User-Agent')

    proxy = None
    if proxy_info:
        proxy = ProxySettings(
            server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
            username=proxy_info.username,
            password=proxy_info.password,
        )

    return await self._browser.new_context(
        user_agent=user_agent,
        extra_http_headers=extra_http_headers,
        proxy=proxy,
    )
8 changes: 5 additions & 3 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any

from playwright.async_api import Playwright, async_playwright
from typing_extensions import override
Expand All @@ -15,6 +15,8 @@
from collections.abc import Mapping
from types import TracebackType

from crawlee.browsers._types import BrowserType

logger = getLogger(__name__)


Expand All @@ -29,7 +31,7 @@ class PlaywrightBrowserPlugin(BaseBrowserPlugin):
def __init__(
self,
*,
browser_type: Literal['chromium', 'firefox', 'webkit'] = 'chromium',
browser_type: BrowserType = 'chromium',
browser_options: Mapping[str, Any] | None = None,
page_options: Mapping[str, Any] | None = None,
max_open_pages_per_browser: int = 20,
Expand All @@ -53,7 +55,7 @@ def __init__(

@property
@override
def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']:
def browser_type(self) -> BrowserType:
return self._browser_type

@property
Expand Down
4 changes: 3 additions & 1 deletion src/crawlee/browsers/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
if TYPE_CHECKING:
from playwright.async_api import Page

BrowserType = Literal['chromium', 'firefox', 'webkit']


@dataclass
class CrawleePage:
"""Represents a page object within a browser, with additional metadata for tracking and management."""

id: str
browser_type: Literal['chromium', 'firefox', 'webkit']
browser_type: BrowserType
page: Page
11 changes: 11 additions & 0 deletions src/crawlee/fingerprint_suite/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@

COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'

# Playwright default headers (user-agents and sec-ch) for headless browsers.
PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA = '"Not=A?Brand";v="8", "Chromium";v="124", "Google Chrome";v="124"'
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE = '?0'
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM = '"macOS"'

# Firefox reports its Gecko revision as "rv:<version>"; the colon is part of the token.
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
)
PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15'

# Random 1000 user agents from Apify fingerprint dataset.
USER_AGENT_POOL = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
Expand Down
84 changes: 72 additions & 12 deletions src/crawlee/fingerprint_suite/_header_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,86 @@
import random
from typing import TYPE_CHECKING

from crawlee.fingerprint_suite._consts import COMMON_ACCEPT, COMMON_ACCEPT_LANGUAGE, USER_AGENT_POOL
from crawlee._types import HttpHeaders
from crawlee.fingerprint_suite._consts import (
COMMON_ACCEPT,
COMMON_ACCEPT_LANGUAGE,
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA,
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE,
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM,
PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT,
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT,
PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT,
USER_AGENT_POOL,
)

if TYPE_CHECKING:
from collections.abc import Mapping
from crawlee.browsers._types import BrowserType


class HeaderGenerator:
"""Generates common headers for HTTP requests."""
"""Generates realistic looking or browser-like HTTP headers."""

def get_common_headers(self) -> HttpHeaders:
    """Get common HTTP headers ("Accept", "Accept-Language").

    We do not modify the "Accept-Encoding", "Connection" and other headers. They should be included and handled
    by the HTTP client or browser.

    Returns:
        An `HttpHeaders` instance holding the "Accept" and "Accept-Language" headers.
    """
    headers = {
        'Accept': COMMON_ACCEPT,
        'Accept-Language': COMMON_ACCEPT_LANGUAGE,
    }
    return HttpHeaders(headers)

def get_random_user_agent_header(self) -> HttpHeaders:
    """Return a single "User-Agent" header whose value is picked at random from `USER_AGENT_POOL`."""
    chosen_user_agent = random.choice(USER_AGENT_POOL)
    return HttpHeaders({'User-Agent': chosen_user_agent})

def get_user_agent_header(
    self,
    *,
    browser_type: BrowserType = 'chromium',
) -> HttpHeaders:
    """Return the "User-Agent" header matching the given browser type.

    Args:
        browser_type: The browser whose predefined User-Agent string should be used.

    Returns:
        An `HttpHeaders` instance with a single "User-Agent" entry.

    Raises:
        ValueError: If `browser_type` is not 'chromium', 'firefox' or 'webkit'.
    """
    known_user_agents = (
        ('chromium', PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT),
        ('firefox', PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT),
        ('webkit', PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT),
    )

    for name, user_agent in known_user_agents:
        if browser_type == name:
            return HttpHeaders({'User-Agent': user_agent})

    raise ValueError(f'Unsupported browser type: {browser_type}')

def get_sec_ch_ua_headers(
    self,
    *,
    browser_type: BrowserType = 'chromium',
) -> HttpHeaders:
    """Return the Sec-Ch-Ua headers matching the given browser type.

    Args:
        browser_type: The browser for which the headers are generated.

    Returns:
        An `HttpHeaders` instance with "Sec-Ch-Ua", "Sec-Ch-Ua-Mobile" and "Sec-Ch-Ua-Platform" for
        Chromium, and an empty one for Firefox and WebKit.

    Raises:
        ValueError: If `browser_type` is not 'chromium', 'firefox' or 'webkit'.
    """
    if browser_type == 'chromium':
        # Currently, only Chromium uses Sec-Ch-Ua headers.
        return HttpHeaders(
            {
                'Sec-Ch-Ua': PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA,
                'Sec-Ch-Ua-Mobile': PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE,
                'Sec-Ch-Ua-Platform': PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM,
            }
        )

    if browser_type == 'firefox' or browser_type == 'webkit':
        # Supported browsers that get no Sec-Ch-Ua headers here.
        return HttpHeaders({})

    raise ValueError(f'Unsupported browser type: {browser_type}')
Loading
Loading