Skip to content

Commit

Permalink
feat: Add pre_navigation_hooks to PlaywrightCrawler (#631)
Browse files Browse the repository at this point in the history
### Description

<!-- The purpose of the PR, list of the changes, ... -->

Add a new decorator for registering pre-navigation hooks.

Example Use:
```python
from crawlee.playwright_crawler import PlaywrightCrawler
from .routes import router

async def main() -> None:
    """The crawler entry point."""
    crawler = PlaywrightCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
    )

    @crawler.pre_navigation_hook
    async def hooky(context) -> None:
        print('Hook1')

    @crawler.pre_navigation_hook
    async def hooky2(context) -> None:
        print('Hook2')

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )
```

### Issues

<!-- If applicable, reference any related GitHub issues -->

- Closes: #427

### Checklist

- [x] CI passed
  • Loading branch information
Prathamesh010 authored Oct 30, 2024
1 parent f9463e7 commit 5dd5b60
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 24 deletions.
10 changes: 9 additions & 1 deletion docs/examples/code/playwright_crawler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext


async def main() -> None:
Expand Down Expand Up @@ -47,6 +47,14 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
# Find a link to the next page and enqueue it if it exists.
await context.enqueue_links(selector='.morelink')

# Define a hook that will be called each time before navigating to a new URL.
# The hook receives a context parameter, providing access to the request and
# browser page among other things. In this example, we log the URL being
# navigated to.
@crawler.pre_navigation_hook
async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
context.log.info(f'Navigating to {context.request.url} ...')

# Run the crawler with the initial list of URLs.
await crawler.run(['https://news.ycombinator.com/'])

Expand Down
2 changes: 2 additions & 0 deletions docs/examples/playwright_crawler.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`Play

The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> manages the browser and page instances, simplifying the process of interacting with web pages. In the request handler, Playwright's API is used to extract data from each post on the page. Specifically, it retrieves the title, rank, and URL of each post. Additionally, the handler enqueues links to the next pages to ensure continuous scraping. This setup is ideal for scraping dynamic web pages where JavaScript execution is required to render the content.

A **pre-navigation hook** can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation.

<CodeBlock className="language-python">
{PlaywrightCrawlerExample}
</CodeBlock>
3 changes: 2 additions & 1 deletion src/crawlee/playwright_crawler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
try:
from ._playwright_crawler import PlaywrightCrawler
from ._playwright_crawling_context import PlaywrightCrawlingContext
from ._playwright_pre_navigation_context import PlaywrightPreNavigationContext
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'playwright' extra."
"For example, if you use pip, run `pip install 'crawlee[playwright]'`.",
) from exc

__all__ = ['PlaywrightCrawler', 'PlaywrightCrawlingContext']
__all__ = ['PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavigationContext']
60 changes: 44 additions & 16 deletions src/crawlee/playwright_crawler/_playwright_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Awaitable, Callable

from pydantic import ValidationError
from typing_extensions import Unpack
Expand All @@ -14,6 +14,7 @@
from crawlee.browsers import BrowserPool
from crawlee.errors import SessionError
from crawlee.playwright_crawler._playwright_crawling_context import PlaywrightCrawlingContext
from crawlee.playwright_crawler._playwright_pre_navigation_context import PlaywrightPreNavigationContext
from crawlee.playwright_crawler._utils import infinite_scroll

if TYPE_CHECKING:
Expand Down Expand Up @@ -95,16 +96,41 @@ def __init__(

# Compose the context pipeline with the Playwright-specific context enhancer.
kwargs['_context_pipeline'] = (
ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
ContextPipeline().compose(self._open_page).compose(self._navigate).compose(self._handle_blocked_request)
)
kwargs['_additional_context_managers'] = [self._browser_pool]
kwargs.setdefault('_logger', logging.getLogger(__name__))
self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavigationContext], Awaitable[None]]] = []

super().__init__(**kwargs)

async def _make_http_request(
async def _open_page(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightPreNavigationContext, None]:
    """Open a fresh browser page and run all registered pre-navigation hooks.

    Enhances the basic crawling context with a Playwright `Page` before the
    actual navigation step of the pipeline runs.

    Args:
        context: The basic crawling context to be enhanced with a browser page.

    Raises:
        ValueError: If the browser pool has not been initialized.

    Yields:
        The pre-navigation context carrying the newly opened page.
    """
    if self._browser_pool is None:
        raise ValueError('Browser pool is not initialized.')

    # Request a new page from the pool, honouring the request's proxy settings.
    new_crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)

    pre_navigation_context = PlaywrightPreNavigationContext(
        request=context.request,
        session=context.session,
        proxy_info=context.proxy_info,
        add_requests=context.add_requests,
        send_request=context.send_request,
        push_data=context.push_data,
        get_key_value_store=context.get_key_value_store,
        log=context.log,
        page=new_crawlee_page.page,
    )

    # Let every registered hook act on the context before navigation starts.
    for registered_hook in self._pre_navigation_hooks:
        await registered_hook(pre_navigation_context)

    yield pre_navigation_context

async def _navigate(
self,
context: BasicCrawlingContext,
context: PlaywrightPreNavigationContext,
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
"""Executes an HTTP request utilizing the `BrowserPool` and the `Playwright` library.
Expand All @@ -119,21 +145,15 @@ async def _make_http_request(
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, and
infinite_scroll).
"""
if self._browser_pool is None:
raise ValueError('Browser pool is not initialized.')

# Create a new browser page
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)

async with crawlee_page.page:
async with context.page:
# Navigate to the URL and get response.
response = await crawlee_page.page.goto(context.request.url)
response = await context.page.goto(context.request.url)

if response is None:
raise SessionError(f'Failed to load the URL: {context.request.url}')

# Set the loaded URL to the actual URL after redirection.
context.request.loaded_url = crawlee_page.page.url
context.request.loaded_url = context.page.url

async def enqueue_links(
*,
Expand All @@ -148,7 +168,7 @@ async def enqueue_links(
requests = list[BaseRequestData]()
user_data = user_data or {}

elements = await crawlee_page.page.query_selector_all(selector)
elements = await context.page.query_selector_all(selector)

for element in elements:
url = await element.get_attribute('href')
Expand Down Expand Up @@ -187,8 +207,8 @@ async def enqueue_links(
proxy_info=context.proxy_info,
get_key_value_store=context.get_key_value_store,
log=context.log,
page=crawlee_page.page,
infinite_scroll=lambda: infinite_scroll(crawlee_page.page),
page=context.page,
infinite_scroll=lambda: infinite_scroll(context.page),
response=response,
enqueue_links=enqueue_links,
)
Expand Down Expand Up @@ -227,3 +247,11 @@ async def _handle_blocked_request(
)

yield context

def pre_navigation_hook(
    self,
    hook: Callable[[PlaywrightPreNavigationContext], Awaitable[None]],
) -> Callable[[PlaywrightPreNavigationContext], Awaitable[None]]:
    """Register a hook to be called before each navigation.

    Can be called directly or used as a decorator. The hook is returned
    unchanged so that, when used as a decorator, the decorated name still
    refers to the original coroutine function instead of being rebound
    to `None` (matching the convention of registration decorators such as
    `router.default_handler`).

    Args:
        hook: A coroutine function to be called before each navigation.

    Returns:
        The registered hook, unmodified.
    """
    self._pre_navigation_hooks.append(hook)
    return hook
11 changes: 5 additions & 6 deletions src/crawlee/playwright_crawler/_playwright_crawling_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,21 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Awaitable, Callable

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee.playwright_crawler._playwright_pre_navigation_context import PlaywrightPreNavigationContext

if TYPE_CHECKING:
from playwright.async_api import Page, Response
from playwright.async_api import Response

from crawlee._types import EnqueueLinksFunction


@dataclass(frozen=True)
class PlaywrightCrawlingContext(BasicCrawlingContext):
class PlaywrightCrawlingContext(PlaywrightPreNavigationContext):
"""The crawling context used by the `PlaywrightCrawler`.
It provides access to key objects as well as utility functions for handling crawling tasks.
"""

page: Page
"""The Playwright `Page` object for the current page."""

response: Response
"""The Playwright `Response` object containing the response details for the current URL."""

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee._types import BasicCrawlingContext

if TYPE_CHECKING:
    from playwright.async_api import Page


@dataclass(frozen=True)
class PlaywrightPreNavigationContext(BasicCrawlingContext):
    """Context passed to pre-navigation hooks of `PlaywrightCrawler`.

    It provides access to the `Page` object for the current browser page,
    in addition to everything available on `BasicCrawlingContext`.
    """

    page: Page
    """The Playwright `Page` object for the current page."""
15 changes: 15 additions & 0 deletions tests/unit/playwright_crawler/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert 'headless' not in headers['User-Agent'].lower()

assert headers['User-Agent'] == PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT


async def test_pre_navigation_hook() -> None:
    """Verify that a registered pre-navigation hook runs once per crawled request."""
    crawler = PlaywrightCrawler()
    # AsyncMock records invocations so we can assert the call count afterwards.
    mock_hook = mock.AsyncMock(return_value=None)

    crawler.pre_navigation_hook(mock_hook)

    # A no-op handler: the test only cares about the hook, not page processing.
    @crawler.router.default_handler
    async def request_handler(_context: PlaywrightCrawlingContext) -> None:
        pass

    # NOTE(review): hits live external URLs — consider a local fixture to avoid flakiness.
    await crawler.run(['https://example.com', 'https://httpbin.org'])

    # Two requests crawled, so the hook must have fired exactly twice.
    assert mock_hook.call_count == 2

0 comments on commit 5dd5b60

Please sign in to comment.