doc: finish the examples section #261

Merged
merged 11 commits on Jul 2, 2024

Changes from all commits

121 changes: 104 additions & 17 deletions docs/examples/add-data-to-dataset.md
@@ -3,28 +3,115 @@ id: add-data-to-dataset
title: Add data to dataset
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

This example demonstrates how to store extracted data in datasets using the `context.push_data()` helper function. If the specified dataset does not already exist, it is created automatically. You can also save data to custom datasets by passing the `dataset_id` or `dataset_name` parameter to `push_data`.

<Tabs groupId="main">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string,
            'html': str(context.soup)[:1000],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of requests.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'html': str(await context.page.content())[:1000],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of requests.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
</Tabs>
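
If you want to keep different kinds of items apart, a handler can also write to a named dataset instead of the default one by using the `dataset_name` parameter mentioned above. The following is a minimal sketch; the dataset name `'products'` is purely illustrative:

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Store the item in a named dataset instead of the default one.
        # The name 'products' is just an example.
        await context.push_data(
            {'url': context.request.url},
            dataset_name='products',
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```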

Each item in the dataset will be stored in its own file within the following directory:

```text
{PROJECT_FOLDER}/storage/datasets/default/
```
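
For a quick sanity check after a crawl, you can list those files with the standard library. This is a minimal sketch that assumes the default local configuration and that items are written as individual JSON files; the exact file names depend on the Crawlee version:

```python
from pathlib import Path

# Default location of the dataset items in a local project.
dataset_dir = Path('storage/datasets/default')

# Print each stored file name together with the beginning of its content.
for item_file in sorted(dataset_dir.glob('*.json')):
    print(item_file.name, item_file.read_text()[:100])
```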

For more control, you can also open a dataset manually using the asynchronous constructor `Dataset.open()` and interact with it directly:

```python
from crawlee.storages import Dataset

# ...

async def main() -> None:
    # Open dataset manually using asynchronous constructor open().
    dataset = await Dataset.open()

    # Interact with dataset directly.
    await dataset.push_data({'key': 'value'})

    # ...
```
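
Named datasets can be opened the same way. The following sketch assumes that `Dataset.open()` accepts a `name` argument and that `iterate_items()` is available for reading items back; check the API reference for your Crawlee version. The name `'products'` is illustrative:

```python
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    # Open (or create) a named dataset.
    dataset = await Dataset.open(name='products')

    # Write a single item.
    await dataset.push_data({'url': 'https://crawlee.dev', 'title': 'Crawlee'})

    # Read the stored items back.
    async for item in dataset.iterate_items():
        print(item)


if __name__ == '__main__':
    asyncio.run(main())
```
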
55 changes: 55 additions & 0 deletions docs/examples/beautifulsoup-crawler.md
@@ -0,0 +1,55 @@
---
id: beautifulsoup-crawler
title: BeautifulSoup crawler
---

This example demonstrates how to use `BeautifulSoupCrawler` to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library, and extract some data from it: the page title and all `<h1>`, `<h2>`, and `<h3>` tags. This setup is well suited for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup library, you can easily navigate the HTML structure and retrieve the data you need with minimal code.

```python
import asyncio
from datetime import timedelta

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically
    # loads the URLs and parses their HTML using the BeautifulSoup library.
    crawler = BeautifulSoupCrawler(
        # On error, retry each page at most once.
        max_request_retries=1,
        # Increase the timeout for processing each page to 30 seconds.
        request_handler_timeout=timedelta(seconds=30),
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    # The handler receives a context parameter, providing various properties and
    # helper methods. Here are a few key ones we use for demonstration:
    # - request: an instance of the Request class containing details such as the URL
    #   being crawled and the HTTP method used.
    # - soup: the BeautifulSoup object containing the parsed HTML of the response.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'h1s': [h1.text for h1 in context.soup.find_all('h1')],
            'h2s': [h2.text for h2 in context.soup.find_all('h2')],
            'h3s': [h3.text for h3 in context.soup.find_all('h3')],
        }

        # Push the extracted data to the default dataset. In local configuration,
        # the data will be stored as JSON files in ./storage/datasets/default.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

if __name__ == '__main__':
    asyncio.run(main())
```
60 changes: 60 additions & 0 deletions docs/examples/capture-screenshot-using-playwright.md
@@ -0,0 +1,60 @@
---
id: capture-screenshots-using-playwright
title: Capture screenshots using Playwright
---

This example demonstrates how to capture screenshots of web pages using `PlaywrightCrawler` and store them in the key-value store.

The `PlaywrightCrawler` is configured to automate browsing and interaction with web pages. It uses Chromium as the browser type; in this example, `headless` is set to `False` so you can watch the browser in action. Each web page from the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method.

The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page.

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        # Headless mode, set to False to see the browser in action.
        headless=False,
        # Browser types supported by Playwright.
        browser_type='chromium',
    )

    # Open the default key-value store.
    kvs = await KeyValueStore.open()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Capture the screenshot of the page using Playwright's API.
        screenshot = await context.page.screenshot()
        name = context.request.url.split('/')[-1]

        # Store the screenshot in the key-value store.
        await kvs.set_value(
            key=f'screenshot-{name}',
            value=screenshot,
            content_type='image/png',
        )

    # Run the crawler with the initial list of URLs.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
```
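
Locally, the stored values should end up under `{PROJECT_FOLDER}/storage/key_value_stores/default/`, assuming the default configuration. To read a screenshot back, you can use `get_value()`; the key below is illustrative and follows the `screenshot-{name}` pattern used above:

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Open the same default key-value store the crawler wrote to.
    kvs = await KeyValueStore.open()

    # The key is illustrative; it matches the 'screenshot-{name}' pattern used above.
    screenshot = await kvs.get_value('screenshot-crawlee.dev')

    if screenshot is not None:
        # Write the PNG bytes to a local file for inspection.
        with open('screenshot.png', 'wb') as f:
            f.write(screenshot)


if __name__ == '__main__':
    asyncio.run(main())
```
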
32 changes: 0 additions & 32 deletions docs/examples/crawl-all-links-on-a-website.md

This file was deleted.

80 changes: 80 additions & 0 deletions docs/examples/crawl-all-links-on-website.md
@@ -0,0 +1,80 @@
---
id: crawl-all-links-on-website
title: Crawl all links on website
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

This example uses the `enqueue_links()` helper to add new links to the `RequestQueue` as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages.

:::tip

If no options are given, the method by default adds only links that are on the same subdomain. You can control this behavior with the `strategy` option; a short sketch follows this tip, and you can find more details in the [Crawl website with relative links](./crawl-website-with-relative-links) example.

:::
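
Here is a short sketch of restricting enqueued links by strategy. The `EnqueueStrategy` import location and its member names are assumptions, so verify them against the API reference:

```python
import asyncio

from crawlee import EnqueueStrategy  # assumed import location
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Only follow links that stay on the same domain as the current page.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```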

<Tabs groupId="main">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

</TabItem>
</Tabs>