diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py
new file mode 100644
index 0000000000..3b721545b2
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -0,0 +1,47 @@
+import asyncio
+import json
+
+from crawlee import HttpHeaders
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+from crawlee.errors import HttpStatusCodeError
+from crawlee.sessions import SessionPool
+
+# Using a placeholder refresh token for this example
+REFRESH_TOKEN = 'PLACEHOLDER'
+UNAUTHORIZED_CODE = 401
+
+
+async def main() -> None:
+    crawler = HttpCrawler(
+        max_request_retries=2,
+        # Only treat 403 as a blocking status code, not 401
+        session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}),
+        # Don't treat 401 responses as errors
+        ignore_http_error_status_codes=[UNAUTHORIZED_CODE],
+    )
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        # Now we can handle 401 responses ourselves
+        if context.http_response.status_code == UNAUTHORIZED_CODE:
+            # Get a fresh access token
+            headers = {'authorization': f'Bearer {REFRESH_TOKEN}'}
+            response = await context.send_request(
+                'https://placeholder.org/refresh', headers=headers
+            )
+            data = json.loads(response.read())
+            # Add the new token to our `Request` headers
+            new_headers = {
+                **context.request.headers,
+                'authorization': f'Bearer {data["access_token"]}',
+            }
+            context.request.headers = HttpHeaders(new_headers)
+            # Trigger a retry with our updated headers
+            raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE)
+
+    await crawler.run(['http://httpbingo.org/status/401'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/error_handling/disable_retry.py b/docs/guides/code_examples/error_handling/disable_retry.py
new file mode 100644
index 0000000000..8d98eff312
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/disable_retry.py
@@ -0,0 +1,30 @@
+import asyncio
+
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+from crawlee.errors import HttpStatusCodeError, SessionError
+
+
+async def main() -> None:
+    crawler = HttpCrawler(max_request_retries=5)
+
+    # Create a parsing error for demonstration
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        raise ValueError('Simulated parsing error')
+
+    # This handler runs before any retry attempts
+    @crawler.error_handler
+    async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        context.log.error(f'Failed request {context.request.url}')
+        # Only allow retries for network-related errors
+        if not isinstance(error, (SessionError, HttpStatusCodeError)):
+            context.log.error('Non-network error detected')
+            # Stop further retry attempts for this `Request`
+            context.request.no_retry = True
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/error_handling/handle_proxy_error.py b/docs/guides/code_examples/error_handling/handle_proxy_error.py
new file mode 100644
index 0000000000..eddb843fdd
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/handle_proxy_error.py
@@ -0,0 +1,40 @@
+import asyncio
+
+from crawlee import Request
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+from crawlee.errors import ProxyError
+
+
+async def main() -> None:
+    # Set how many session rotations will happen before calling the error handler
+    # when ProxyError occurs
+    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)
+
+    # For this example, we'll create a proxy error in our handler
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        raise ProxyError('Simulated proxy error')
+
+    # This handler runs after all retry attempts are exhausted
+    @crawler.failed_request_handler
+    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        context.log.error(f'Failed request {context.request.url}, after 5 rotations')
+        request = context.request
+        # For proxy errors, we can add a new `Request` to try again
+        if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'):
+            context.log.info(f'Retrying {request.url} ...')
+            # Create a new `Request` with a modified key to avoid deduplication
+            new_request = Request.from_url(
+                request.url, unique_key=f'retry{request.unique_key}'
+            )
+
+            # Add the new `Request` to the `Queue`
+            rq = await crawler.get_request_manager()
+            await rq.add_request(new_request)
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/error_handling.mdx b/docs/guides/error_handling.mdx
new file mode 100644
index 0000000000..abd1b33058
--- /dev/null
+++ b/docs/guides/error_handling.mdx
@@ -0,0 +1,44 @@
+---
+id: error-handling
+title: Error handling
+description: How to handle errors that occur during web crawling.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py';
+import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';
+import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';
+
+This guide demonstrates techniques for handling common errors that occur during web crawling.
+
+## Handling proxy errors
+
+Low-quality proxies can cause problems even with high values of `max_request_retries` and `max_session_rotations` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. If proxy errors still prevent you from getting the data, you can retry the request from <ApiLink to="class/BasicCrawler#failed_request_handler">`failed_request_handler`</ApiLink>:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {HandleProxyError}
+</RunnableCodeBlock>
+
+The same approach is useful when evaluating different proxy providers. To keep that process under control, you can count proxy errors and [stop the crawler](../examples/crawler-stop) once there are too many, as sketched below.
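+
+If a provider keeps failing, a simple counter in the same `failed_request_handler` can shut the crawl down early. The following is only a sketch, not one of the example files above: the `MAX_PROXY_ERRORS` threshold is made up for illustration, and stopping relies on the `crawler.stop()` method covered in the linked example.
+
+```python
+import asyncio
+
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+from crawlee.errors import ProxyError
+
+# Illustrative threshold, adjust it to your proxy provider.
+MAX_PROXY_ERRORS = 10
+
+
+async def main() -> None:
+    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)
+    proxy_error_count = 0
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+    # Count requests that ultimately failed because of proxy errors.
+    @crawler.failed_request_handler
+    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        nonlocal proxy_error_count
+        if isinstance(error, ProxyError):
+            proxy_error_count += 1
+            if proxy_error_count >= MAX_PROXY_ERRORS:
+                context.log.error('Too many proxy errors, stopping the crawler.')
+                crawler.stop()
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
+```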
+
+## Changing how error status codes are handled
+
+By default, when a <ApiLink to="class/Session">`Session`</ApiLink> receives a status code such as [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee treats it as blocked, marks the <ApiLink to="class/Session">`Session`</ApiLink> for retirement, and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management).
+
+Here's an example of how to change this behavior:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {ChangeHandleErrorStatus}
+</RunnableCodeBlock>
+
+## Turning off retries for non-network errors
+
+Sometimes you might get unexpected errors when parsing data, for example when a website has an unusual structure. Crawlee normally retries the request according to your `max_request_retries` setting, but retrying rarely helps with errors like these.
+
+Here's how to turn off retries for non-network errors using <ApiLink to="class/BasicCrawler#error_handler">`error_handler`</ApiLink>, which runs before each retry attempt:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {DisableRetry}
+</RunnableCodeBlock>
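+
+Requests whose retries you disable this way are still handed to <ApiLink to="class/BasicCrawler#failed_request_handler">`failed_request_handler`</ApiLink>, so you can record them there for later inspection. The sketch below builds on the example above; the stored fields are illustrative rather than part of the example files:
+
+```python
+import asyncio
+
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+
+
+async def main() -> None:
+    crawler = HttpCrawler(max_request_retries=5)
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        raise ValueError('Simulated parsing error')
+
+    @crawler.error_handler
+    async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        # Give up on the request right away, as in the example above.
+        context.request.no_retry = True
+
+    # Requests that will not be retried end up here, where they can be recorded.
+    @crawler.failed_request_handler
+    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        await context.push_data({'url': context.request.url, 'error': str(error)})
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
+```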