Skip to content

Commit

Permalink
feat: add support use_incognito_pages for browser_launch_options
Browse files Browse the repository at this point in the history
…in `PlaywrightCrawler` (#941)

### Description

- Improve cookie handling for `PlaywrightCrawler`. Cookies are now
stored in the `Session` and set in Playwright Context from the
`Session`.
- Add `use_incognito_pages` option for `browser_launch_options` allowing
each new page to be launched in a separate context.

### Issues

- #722 
- #933
  • Loading branch information
Mantisus authored Feb 5, 2025
1 parent 3c89827 commit eae3a33
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 13 deletions.
6 changes: 6 additions & 0 deletions src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def with_default_plugin(
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
use_incognito_pages: bool | None = False,
**kwargs: Any,
) -> BrowserPool:
"""Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
Expand All @@ -116,6 +117,8 @@ def with_default_plugin(
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
headless: Whether to run the browser in headless mode.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
kwargs: Additional arguments for default constructor.
"""
plugin_options: dict = defaultdict(dict)
Expand All @@ -125,6 +128,9 @@ def with_default_plugin(
if headless is not None:
plugin_options['browser_launch_options']['headless'] = headless

if use_incognito_pages is not None:
plugin_options['use_incognito_pages'] = use_incognito_pages

if browser_type:
plugin_options['browser_type'] = browser_type

Expand Down
31 changes: 24 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,24 @@ def __init__(
browser: Browser,
*,
max_open_pages_per_browser: int = 20,
use_incognito_pages: bool = False,
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
) -> None:
"""A default constructor.
Args:
browser: The browser instance to control.
max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
disable automatic header modifications.
"""
self._browser = browser
self._max_open_pages_per_browser = max_open_pages_per_browser
self._header_generator = header_generator
self._use_incognito_pages = use_incognito_pages

self._browser_context: BrowserContext | None = None
self._pages = list[Page]()
Expand Down Expand Up @@ -115,13 +119,20 @@ async def new_page(
Raises:
ValueError: If the browser has reached the maximum number of open pages.
"""
if not self._browser_context:
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')

page = await self._browser_context.new_page()
if self._use_incognito_pages:
# We use https://playwright.dev/python/docs/api/class-browser#browser-new-page to create a page in
# a separate context.
page_context_options = self._create_context_options(browser_new_context_options, proxy_info)
page = await self._browser.new_page(**page_context_options)
else:
# We use https://playwright.dev/python/docs/api/class-browser#browser-new-context to create a context.
# Pages are then created in this context.
if not self._browser_context:
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
page = await self._browser_context.new_page()

# Handle page close event
page.on(event='close', f=self._on_page_close)
Expand Down Expand Up @@ -153,10 +164,10 @@ def _on_page_close(self, page: Page) -> None:
"""Handle actions after a page is closed."""
self._pages.remove(page)

async def _create_browser_context(
def _create_context_options(
self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
) -> BrowserContext:
"""Create a new browser context with the specified proxy settings."""
) -> Mapping[str, Any]:
"""Create context options for context and single pages with the specified proxy settings."""
if self._header_generator:
common_headers = self._header_generator.get_common_headers()
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
Expand All @@ -179,5 +190,11 @@ async def _create_browser_context(
username=proxy_info.username,
password=proxy_info.password,
)
return browser_new_context_options

async def _create_browser_context(
    self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
) -> BrowserContext:
    """Open a new browser context configured with the given options and proxy settings.

    Args:
        browser_new_context_options: Keyword arguments forwarded to Playwright's `browser.new_context`.
        proxy_info: Optional proxy configuration merged into the context options.

    Returns:
        The newly created Playwright browser context.
    """
    # Build the final option mapping (headers + proxy) via the shared helper, then hand it to Playwright.
    context_options = self._create_context_options(browser_new_context_options, proxy_info)
    return await self._browser.new_context(**context_options)
5 changes: 5 additions & 0 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(
browser_launch_options: dict[str, Any] | None = None,
browser_new_context_options: dict[str, Any] | None = None,
max_open_pages_per_browser: int = 20,
use_incognito_pages: bool = False,
) -> None:
"""A default constructor.
Expand All @@ -56,6 +57,8 @@ def __init__(
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
Once reached, a new browser instance will be launched to handle the excess.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
"""
config = service_locator.get_configuration()

Expand All @@ -70,6 +73,7 @@ def __init__(
self._browser_launch_options = default_launch_browser_options | (browser_launch_options or {})
self._browser_new_context_options = browser_new_context_options or {}
self._max_open_pages_per_browser = max_open_pages_per_browser
self._use_incognito_pages = use_incognito_pages

self._playwright_context_manager = async_playwright()
self._playwright: Playwright | None = None
Expand Down Expand Up @@ -154,5 +158,6 @@ async def new_browser(self) -> PlaywrightBrowserController:

return PlaywrightBrowserController(
browser,
use_incognito_pages=self._use_incognito_pages,
max_open_pages_per_browser=self._max_open_pages_per_browser,
)
39 changes: 36 additions & 3 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable

from pydantic import ValidationError
from yarl import URL

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
Expand All @@ -22,6 +23,7 @@
if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable, Mapping

from playwright.async_api import Page
from typing_extensions import Unpack

from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
Expand Down Expand Up @@ -76,6 +78,7 @@ def __init__(
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
use_incognito_pages: bool | None = None,
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
) -> None:
"""A default constructor.
Expand All @@ -94,17 +97,27 @@ def __init__(
This option should not be used if `browser_pool` is provided.
headless: Whether to run the browser in headless mode.
This option should not be used if `browser_pool` is provided.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
This option should not be used if `browser_pool` is provided.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
"""
if browser_pool:
# Raise an exception if browser_pool is provided together with other browser-related arguments.
if any(
param is not None
for param in (headless, browser_type, browser_launch_options, browser_new_context_options)
for param in (
use_incognito_pages,
headless,
browser_type,
browser_launch_options,
browser_new_context_options,
)
):
raise ValueError(
'You cannot provide `headless`, `browser_type`, `browser_launch_options` or '
'`browser_new_context_options` arguments when `browser_pool` is provided.'
'You cannot provide `headless`, `browser_type`, `browser_launch_options`'
'`browser_new_context_options` or `use_incognito_pages` arguments when '
'`browser_pool` is provided.'
)

# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
Expand All @@ -114,6 +127,7 @@ def __init__(
browser_type=browser_type,
browser_launch_options=browser_launch_options,
browser_new_context_options=browser_new_context_options,
use_incognito_pages=use_incognito_pages,
)

self._browser_pool = browser_pool
Expand Down Expand Up @@ -175,6 +189,9 @@ async def _navigate(
infinite_scroll and block_requests).
"""
async with context.page:
if context.session:
await self._set_cookies(context.page, context.request.url, context.session.cookies)

if context.request.headers:
await context.page.set_extra_http_headers(context.request.headers.model_dump())
# Navigate to the URL and get response.
Expand All @@ -186,6 +203,10 @@ async def _navigate(
# Set the loaded URL to the actual URL after redirection.
context.request.loaded_url = context.page.url

if context.session:
cookies = await self._get_cookies(context.page)
context.session.cookies.update(cookies)

async def enqueue_links(
*,
selector: str = 'a',
Expand Down Expand Up @@ -295,3 +316,15 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
hook: A coroutine function to be called before each navigation.
"""
self._pre_navigation_hooks.append(hook)

async def _get_cookies(self, page: Page) -> dict[str, str]:
    """Collect the cookies of the page's browser context as a plain name-to-value mapping.

    Cookies that are missing a name or a value are skipped.
    """
    raw_cookies = await page.context.cookies()
    result: dict[str, str] = {}
    for cookie in raw_cookies:
        name = cookie.get('name')
        value = cookie.get('value')
        if name and value:
            result[name] = value
    return result

async def _set_cookies(self, page: Page, url: str, cookies: dict[str, str]) -> None:
    """Install the given cookies into the page's browser context.

    Each cookie is scoped to the host of `url` with path '/'.
    """
    host = URL(url).host
    cookie_records = []
    for name, value in cookies.items():
        cookie_records.append({'name': name, 'value': value, 'domain': host, 'path': '/'})
    await page.context.add_cookies(cookie_records)
4 changes: 2 additions & 2 deletions src/crawlee/sessions/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(
usage_count: int = 0,
max_usage_count: int = 50,
error_score: float = 0.0,
cookies: dict | None = None,
cookies: dict[str, str] | None = None,
blocked_status_codes: list | None = None,
) -> None:
"""A default constructor.
Expand Down Expand Up @@ -94,7 +94,7 @@ def user_data(self) -> dict:
return self._user_data

@property
def cookies(self) -> dict:
def cookies(self) -> dict[str, str]:
"""Get the cookies."""
return self._cookies

Expand Down
79 changes: 78 additions & 1 deletion tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from typing import TYPE_CHECKING
from unittest import mock

from crawlee import Glob, HttpHeaders, Request, RequestTransformAction
import pytest

from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction
from crawlee._types import EnqueueStrategy
from crawlee.crawlers import PlaywrightCrawler
from crawlee.fingerprint_suite._consts import (
Expand All @@ -19,6 +21,7 @@
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT,
)
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool

if TYPE_CHECKING:
from yarl import URL
Expand Down Expand Up @@ -247,3 +250,77 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
await crawler.run(['https://test.com'])

assert handler_data.get('proxy') == proxy_value


@pytest.mark.parametrize(
    'use_incognito_pages',
    [
        pytest.param(False, id='without use_incognito_pages'),
        pytest.param(True, id='with use_incognito_pages'),
    ],
)
async def test_isolation_cookies(*, use_incognito_pages: bool, httpbin: URL) -> None:
    """Check cookie isolation between sessions with and without `use_incognito_pages`.

    Runs three sequential requests against httpbin with a single-session pool:
    the first sets a cookie, the second verifies it and retires the session,
    the third runs on a fresh session and shows whether the cookie leaked.
    """
    sessions_ids: list[str] = []
    sessions_cookies: dict[str, dict[str, str]] = {}
    response_cookies: dict[str, dict[str, str]] = {}

    # max_pool_size=1 and max_concurrency=1 force a deterministic request order
    # on a single session at a time, which the assertions below rely on.
    crawler = PlaywrightCrawler(
        session_pool=SessionPool(max_pool_size=1),
        use_incognito_pages=use_incognito_pages,
        concurrency_settings=ConcurrencySettings(max_concurrency=1),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        if not context.session:
            return

        sessions_ids.append(context.session.id)

        # Only the two '/cookies' check requests record state; the '/cookies/set'
        # request merely plants the cookie.
        if context.request.unique_key not in {'1', '2'}:
            return

        sessions_cookies[context.session.id] = context.session.cookies
        response_data = json.loads(await context.response.text())
        response_cookies[context.session.id] = response_data.get('cookies')

        if context.request.user_data.get('retire_session'):
            context.session.retire()

    await crawler.run(
        [
            # The first request sets the cookie in the session
            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
            # With the second request, we check the cookies in the session and set retire
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
        ]
    )

    assert len(sessions_cookies) == 2
    assert len(response_cookies) == 2

    # The first two requests ran on the same (later retired) session.
    assert sessions_ids[0] == sessions_ids[1]

    cookie_session_id = sessions_ids[0]
    clean_session_id = sessions_ids[2]

    assert cookie_session_id != clean_session_id

    # When using `use_incognito_pages` there should be full cookie isolation
    if use_incognito_pages:
        # The initiated cookies must match in both the response and the session store
        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

        # For a clean session, the cookie should not be in the session store or in the response
        # This way we can be sure that no cookies are being leaked through the http client
        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
    # Without `use_incognito_pages` we will have access to the session cookie,
    # but there will be a cookie leak via PlaywrightContext
    else:
        # The initiated cookies must match in both the response and the session store
        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

        # PlaywrightContext makes cookies shared by all sessions that work with it.
        # So in this case a clean session contains the same cookies
        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {'a': '1'}

0 comments on commit eae3a33

Please sign in to comment.