Skip to content

Commit

Permalink
feat: add support use_incognito_pages for browser_launch_options
Browse files Browse the repository at this point in the history
…in `PlaywrightCrawler` (#941)

### Description

- Improve cookie handling for `PlaywrightCrawler`. Cookies are now
stored in the `Session` and set in Playwright Context from the
`Session`.
- Add `use_incognito_pages` option for `browser_launch_options` allowing
each new page to be launched in a separate context.

### Issues

- #722 
- #933
  • Loading branch information
Mantisus authored Feb 5, 2025
1 parent 3c89827 commit eae3a33
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 13 deletions.
6 changes: 6 additions & 0 deletions src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def with_default_plugin(
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
use_incognito_pages: bool | None = False,
**kwargs: Any,
) -> BrowserPool:
"""Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
Expand All @@ -116,6 +117,8 @@ def with_default_plugin(
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
headless: Whether to run the browser in headless mode.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
kwargs: Additional arguments for default constructor.
"""
plugin_options: dict = defaultdict(dict)
Expand All @@ -125,6 +128,9 @@ def with_default_plugin(
if headless is not None:
plugin_options['browser_launch_options']['headless'] = headless

if use_incognito_pages is not None:
plugin_options['use_incognito_pages'] = use_incognito_pages

if browser_type:
plugin_options['browser_type'] = browser_type

Expand Down
31 changes: 24 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,24 @@ def __init__(
browser: Browser,
*,
max_open_pages_per_browser: int = 20,
use_incognito_pages: bool = False,
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
) -> None:
"""A default constructor.
Args:
browser: The browser instance to control.
max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
disable automatic header modifications.
"""
self._browser = browser
self._max_open_pages_per_browser = max_open_pages_per_browser
self._header_generator = header_generator
self._use_incognito_pages = use_incognito_pages

self._browser_context: BrowserContext | None = None
self._pages = list[Page]()
Expand Down Expand Up @@ -115,13 +119,20 @@ async def new_page(
Raises:
ValueError: If the browser has reached the maximum number of open pages.
"""
if not self._browser_context:
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')

page = await self._browser_context.new_page()
if self._use_incognito_pages:
# We use https://playwright.dev/python/docs/api/class-browser#browser-new-page to create a page in
# a separate context.
page_context_options = self._create_context_options(browser_new_context_options, proxy_info)
page = await self._browser.new_page(**page_context_options)
else:
# We use https://playwright.dev/python/docs/api/class-browser#browser-new-context to create a context.
# Pages are then created in this context.
if not self._browser_context:
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
page = await self._browser_context.new_page()

# Handle page close event
page.on(event='close', f=self._on_page_close)
Expand Down Expand Up @@ -153,10 +164,10 @@ def _on_page_close(self, page: Page) -> None:
"""Handle actions after a page is closed."""
self._pages.remove(page)

async def _create_browser_context(
def _create_context_options(
self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
) -> BrowserContext:
"""Create a new browser context with the specified proxy settings."""
) -> Mapping[str, Any]:
"""Create context options for context and single pages with the specified proxy settings."""
if self._header_generator:
common_headers = self._header_generator.get_common_headers()
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
Expand All @@ -179,5 +190,11 @@ async def _create_browser_context(
username=proxy_info.username,
password=proxy_info.password,
)
return browser_new_context_options

async def _create_browser_context(
    self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
) -> BrowserContext:
    """Open a new browser context configured with the given options and proxy settings.

    Args:
        browser_new_context_options: Keyword arguments forwarded to Playwright's `browser.new_context`.
        proxy_info: Optional proxy configuration merged into the context options.

    Returns:
        The newly created Playwright browser context.
    """
    # Build the final option mapping (headers + proxy) via the shared helper, then hand it to Playwright.
    context_options = self._create_context_options(browser_new_context_options, proxy_info)
    return await self._browser.new_context(**context_options)
5 changes: 5 additions & 0 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(
browser_launch_options: dict[str, Any] | None = None,
browser_new_context_options: dict[str, Any] | None = None,
max_open_pages_per_browser: int = 20,
use_incognito_pages: bool = False,
) -> None:
"""A default constructor.
Expand All @@ -56,6 +57,8 @@ def __init__(
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
Once reached, a new browser instance will be launched to handle the excess.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
"""
config = service_locator.get_configuration()

Expand All @@ -70,6 +73,7 @@ def __init__(
self._browser_launch_options = default_launch_browser_options | (browser_launch_options or {})
self._browser_new_context_options = browser_new_context_options or {}
self._max_open_pages_per_browser = max_open_pages_per_browser
self._use_incognito_pages = use_incognito_pages

self._playwright_context_manager = async_playwright()
self._playwright: Playwright | None = None
Expand Down Expand Up @@ -154,5 +158,6 @@ async def new_browser(self) -> PlaywrightBrowserController:

return PlaywrightBrowserController(
browser,
use_incognito_pages=self._use_incognito_pages,
max_open_pages_per_browser=self._max_open_pages_per_browser,
)
39 changes: 36 additions & 3 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable

from pydantic import ValidationError
from yarl import URL

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
Expand All @@ -22,6 +23,7 @@
if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable, Mapping

from playwright.async_api import Page
from typing_extensions import Unpack

from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
Expand Down Expand Up @@ -76,6 +78,7 @@ def __init__(
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
use_incognito_pages: bool | None = None,
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
) -> None:
"""A default constructor.
Expand All @@ -94,17 +97,27 @@ def __init__(
This option should not be used if `browser_pool` is provided.
headless: Whether to run the browser in headless mode.
This option should not be used if `browser_pool` is provided.
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
own context that is destroyed once the page is closed or crashes.
This option should not be used if `browser_pool` is provided.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
"""
if browser_pool:
# Raise an exception if browser_pool is provided together with other browser-related arguments.
if any(
param is not None
for param in (headless, browser_type, browser_launch_options, browser_new_context_options)
for param in (
use_incognito_pages,
headless,
browser_type,
browser_launch_options,
browser_new_context_options,
)
):
raise ValueError(
'You cannot provide `headless`, `browser_type`, `browser_launch_options` or '
'`browser_new_context_options` arguments when `browser_pool` is provided.'
'You cannot provide `headless`, `browser_type`, `browser_launch_options`'
'`browser_new_context_options` or `use_incognito_pages` arguments when '
'`browser_pool` is provided.'
)

# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
Expand All @@ -114,6 +127,7 @@ def __init__(
browser_type=browser_type,
browser_launch_options=browser_launch_options,
browser_new_context_options=browser_new_context_options,
use_incognito_pages=use_incognito_pages,
)

self._browser_pool = browser_pool
Expand Down Expand Up @@ -175,6 +189,9 @@ async def _navigate(
infinite_scroll and block_requests).
"""
async with context.page:
if context.session:
await self._set_cookies(context.page, context.request.url, context.session.cookies)

if context.request.headers:
await context.page.set_extra_http_headers(context.request.headers.model_dump())
# Navigate to the URL and get response.
Expand All @@ -186,6 +203,10 @@ async def _navigate(
# Set the loaded URL to the actual URL after redirection.
context.request.loaded_url = context.page.url

if context.session:
cookies = await self._get_cookies(context.page)
context.session.cookies.update(cookies)

async def enqueue_links(
*,
selector: str = 'a',
Expand Down Expand Up @@ -295,3 +316,15 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
hook: A coroutine function to be called before each navigation.
"""
self._pre_navigation_hooks.append(hook)

async def _get_cookies(self, page: Page) -> dict[str, str]:
    """Collect the cookies of the page's browser context as a plain name-to-value mapping.

    Cookies that are missing a name or a value are skipped.
    """
    raw_cookies = await page.context.cookies()
    result: dict[str, str] = {}
    for cookie in raw_cookies:
        name = cookie.get('name')
        value = cookie.get('value')
        if name and value:
            result[name] = value
    return result

async def _set_cookies(self, page: Page, url: str, cookies: dict[str, str]) -> None:
    """Install the given cookies into the page's browser context.

    Each cookie is scoped to the host of `url` with path '/'.
    """
    host = URL(url).host
    cookie_records = []
    for name, value in cookies.items():
        cookie_records.append({'name': name, 'value': value, 'domain': host, 'path': '/'})
    await page.context.add_cookies(cookie_records)
4 changes: 2 additions & 2 deletions src/crawlee/sessions/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(
usage_count: int = 0,
max_usage_count: int = 50,
error_score: float = 0.0,
cookies: dict | None = None,
cookies: dict[str, str] | None = None,
blocked_status_codes: list | None = None,
) -> None:
"""A default constructor.
Expand Down Expand Up @@ -94,7 +94,7 @@ def user_data(self) -> dict:
return self._user_data

@property
def cookies(self) -> dict:
def cookies(self) -> dict[str, str]:
"""Get the cookies."""
return self._cookies

Expand Down
79 changes: 78 additions & 1 deletion tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from typing import TYPE_CHECKING
from unittest import mock

from crawlee import Glob, HttpHeaders, Request, RequestTransformAction
import pytest

from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction
from crawlee._types import EnqueueStrategy
from crawlee.crawlers import PlaywrightCrawler
from crawlee.fingerprint_suite._consts import (
Expand All @@ -19,6 +21,7 @@
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT,
)
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool

if TYPE_CHECKING:
from yarl import URL
Expand Down Expand Up @@ -247,3 +250,77 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
await crawler.run(['https://test.com'])

assert handler_data.get('proxy') == proxy_value


@pytest.mark.parametrize(
    'use_incognito_pages',
    [
        pytest.param(False, id='without use_incognito_pages'),
        pytest.param(True, id='with use_incognito_pages'),
    ],
)
async def test_isolation_cookies(*, use_incognito_pages: bool, httpbin: URL) -> None:
    """Check cookie isolation between sessions with and without `use_incognito_pages`.

    Runs three sequential requests against httpbin with a single-session pool:
    the first sets a cookie, the second verifies it and retires the session,
    the third runs on a fresh session and shows whether the cookie leaked.
    """
    sessions_ids: list[str] = []
    sessions_cookies: dict[str, dict[str, str]] = {}
    response_cookies: dict[str, dict[str, str]] = {}

    # max_pool_size=1 and max_concurrency=1 force a deterministic request order
    # on a single session at a time, which the assertions below rely on.
    crawler = PlaywrightCrawler(
        session_pool=SessionPool(max_pool_size=1),
        use_incognito_pages=use_incognito_pages,
        concurrency_settings=ConcurrencySettings(max_concurrency=1),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        if not context.session:
            return

        sessions_ids.append(context.session.id)

        # Only the two '/cookies' check requests record state; the '/cookies/set'
        # request merely plants the cookie.
        if context.request.unique_key not in {'1', '2'}:
            return

        sessions_cookies[context.session.id] = context.session.cookies
        response_data = json.loads(await context.response.text())
        response_cookies[context.session.id] = response_data.get('cookies')

        if context.request.user_data.get('retire_session'):
            context.session.retire()

    await crawler.run(
        [
            # The first request sets the cookie in the session
            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
            # With the second request, we check the cookies in the session and set retire
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
        ]
    )

    assert len(sessions_cookies) == 2
    assert len(response_cookies) == 2

    # The first two requests ran on the same (later retired) session.
    assert sessions_ids[0] == sessions_ids[1]

    cookie_session_id = sessions_ids[0]
    clean_session_id = sessions_ids[2]

    assert cookie_session_id != clean_session_id

    # When using `use_incognito_pages` there should be full cookie isolation
    if use_incognito_pages:
        # The initiated cookies must match in both the response and the session store
        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

        # For a clean session, the cookie should not be in the session store or in the response
        # This way we can be sure that no cookies are being leaked through the http client
        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
    # Without `use_incognito_pages` we will have access to the session cookie,
    # but there will be a cookie leak via PlaywrightContext
    else:
        # The initiated cookies must match in both the response and the session store
        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

        # PlaywrightContext makes cookies shared by all sessions that work with it.
        # So in this case a clean session contains the same cookies
        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {'a': '1'}

0 comments on commit eae3a33

Please sign in to comment.