94 changes: 94 additions & 0 deletions docs/examples/code_examples/run_parallel_crawlers.py
@@ -0,0 +1,94 @@
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import (
ParselCrawler,
ParselCrawlingContext,
PlaywrightCrawler,
PlaywrightCrawlingContext,
)
from crawlee.sessions import SessionPool
from crawlee.storages import RequestQueue


async def main() -> None:
# Open request queues for both crawlers with different aliases
playwright_rq = await RequestQueue.open(alias='playwright-requests')
parsel_rq = await RequestQueue.open(alias='parsel-requests')

# Use a shared session pool between both crawlers
async with SessionPool() as session_pool:
playwright_crawler = PlaywrightCrawler(
# Set the request queue for Playwright crawler
request_manager=playwright_rq,
session_pool=session_pool,
# Configure concurrency settings for Playwright crawler
concurrency_settings=ConcurrencySettings(
max_concurrency=5, desired_concurrency=5
),
            # Set `keep_alive` so that the crawler does not stop working when there are
# no requests in the queue.
keep_alive=True,
)

parsel_crawler = ParselCrawler(
# Set the request queue for Parsel crawler
request_manager=parsel_rq,
session_pool=session_pool,
# Configure concurrency settings for Parsel crawler
concurrency_settings=ConcurrencySettings(
max_concurrency=10, desired_concurrency=10
),
# Set maximum requests per crawl for Parsel crawler
max_requests_per_crawl=50,
)

@playwright_crawler.router.default_handler
async def handle_playwright(context: PlaywrightCrawlingContext) -> None:
context.log.info(f'Playwright Processing {context.request.url}...')

title = await context.page.title()
# Push the extracted data to the dataset for Playwright crawler
await context.push_data(
{'title': title, 'url': context.request.url, 'source': 'playwright'},
dataset_name='playwright-data',
)

@parsel_crawler.router.default_handler
async def handle_parsel(context: ParselCrawlingContext) -> None:
context.log.info(f'Parsel Processing {context.request.url}...')

title = context.parsed_content.css('title::text').get()
# Push the extracted data to the dataset for Parsel crawler
await context.push_data(
{'title': title, 'url': context.request.url, 'source': 'parsel'},
dataset_name='parsel-data',
)

# Enqueue links to the Playwright request queue for blog pages
await context.enqueue_links(
selector='a[href*="/blog/"]', rq_alias='playwright-requests'
)
# Enqueue other links to the Parsel request queue
await context.enqueue_links(selector='a:not([href*="/blog/"])')

        # Start the Playwright crawler in the background with an empty request list;
        # it will process requests enqueued into its queue by the Parsel crawler
background_crawler_task = asyncio.create_task(playwright_crawler.run([]))

# Run the Parsel crawler with the initial URL and wait for it to finish
await parsel_crawler.run(['https://crawlee.dev/blog'])

        # Wait until the Playwright request queue has been drained
while not await playwright_rq.is_empty():
playwright_crawler.log.info('Waiting for Playwright crawler to finish...')
await asyncio.sleep(5)

# Stop the Playwright crawler after all requests are processed
playwright_crawler.stop()

# Wait for the background Playwright crawler task to complete
await background_crawler_task


if __name__ == '__main__':
asyncio.run(main())
19 changes: 19 additions & 0 deletions docs/examples/run_parallel_crawlers.mdx
@@ -0,0 +1,19 @@
---
id: run-parallel-crawlers
title: Run parallel crawlers
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RunParallelCrawlersExample from '!!raw-loader!roa-loader!./code_examples/run_parallel_crawlers.py';

This example demonstrates how to run two crawlers in parallel, where one crawler processes links discovered by the other.

In some situations, you may need different approaches for scraping data from a website. For example, you might use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> for navigating JavaScript-heavy pages and a faster, more lightweight <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> for processing static pages. One way to solve this is to use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>; see the [Adaptive Playwright crawler example](./adaptive-playwright-crawler) to learn more.

The code below demonstrates an alternative approach using two separate crawlers. Links are passed between crawlers via <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> aliases. The `keep_alive` option allows the Playwright crawler to run in the background and wait for incoming links without stopping when its queue is empty. You can also use different storage clients for each crawler without losing the ability to pass links between queues. Learn more about available storage clients in this [guide](/python/docs/guides/storage-clients).
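
The hand-off itself comes down to two calls: opening a <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> under an alias, and routing links into it from the other crawler's handler via the `rq_alias` argument of `enqueue_links`. Here is a condensed sketch of just that part, reusing the alias, selector, and start URL from the full example below:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.storages import RequestQueue


async def main() -> None:
    # Open a named queue; a background crawler would use it as its `request_manager`.
    playwright_rq = await RequestQueue.open(alias='playwright-requests')

    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Send blog links to the aliased queue instead of this crawler's own queue.
        await context.enqueue_links(
            selector='a[href*="/blog/"]', rq_alias='playwright-requests'
        )

    await crawler.run(['https://crawlee.dev/blog'])
    # The aliased queue now holds the blog links for the other crawler to pick up.
    crawler.log.info(f'Aliased queue empty: {await playwright_rq.is_empty()}')


if __name__ == '__main__':
    asyncio.run(main())
```

A second crawler opened with `request_manager=await RequestQueue.open(alias='playwright-requests')` and `keep_alive=True` then consumes those links, which is exactly what the full example wires together: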

<RunnableCodeBlock className="language-python" language="python">
{RunParallelCrawlersExample}
</RunnableCodeBlock>