feat: add new curl impersonate HTTP client (#387)

### Description

- Add a new curl impersonate HTTP client utilizing the
[curl-cffi](https://pypi.org/project/curl-cffi/) package.
- Improve API docs of the HTTP clients and define public & private
interfaces.
- I encountered a few bugs and some "not great" behaviour in `curl-cffi` and opened issues for them:
  - lexiforest/curl_cffi#360
  - lexiforest/curl_cffi#361
- Because of the above bugs, I decided not to make the curl impersonate client the default and to keep the HTTPX client instead.
- I also had to move some general components from the `basic_crawler` module to the root of the package. This may not be ideal, so I am open to other suggestions on how to organize them.

### Issues

- Closes: #292

### Testing

- New unit tests were written.
- Alternatively, check the example below, which uses the curl impersonate client with the BeautifulSoup crawler.

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    proxy_configuration = ProxyConfiguration(
        proxy_urls=[
            'http://username:password@proxy.apify.com:8000',
        ],
    )

    crawler = BeautifulSoupCrawler(
        max_requests_per_crawl=10,
        proxy_configuration=proxy_configuration,
        http_client=CurlImpersonateHttpClient(),
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}...')
        await context.enqueue_links()

        data = {
            'url': context.request.url,
            'title': context.soup.title.text if context.soup.title else '',
        }

        context.log.info(f'Extracted data: {data}')
        await context.push_data(data)

    await crawler.run(['https://apify.com', 'https://crawlee.dev/'])
    crawler.log.info('Finished crawling.')


if __name__ == '__main__':
    asyncio.run(main())
```

### TODO

- [ ] Before merging, add better documentation of the HTTP clients, covering how to switch between them and how to implement a new one (a rough skeleton of the latter is sketched below).
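
As a rough illustration of the "implementing a new one" item, the skeleton below sketches a custom HTTP client. The method names and parameters are inferred from the call sites in this commit (`crawl` in `BeautifulSoupCrawler` and `send_request` in `BasicCrawler`), so the exact abstract signatures on `BaseHttpClient` should be treated as assumptions.

```python
from __future__ import annotations

# Note: per this commit, importing from crawlee.http_clients requires the
# 'curl-impersonate' extra to be installed (see the __init__.py change below).
from crawlee.http_clients import BaseHttpClient, HttpCrawlingResult, HttpResponse


class MyHttpClient(BaseHttpClient):
    """Skeleton of a custom HTTP client; signatures are inferred, not documented."""

    async def crawl(self, request, *, session=None, proxy_info=None, statistics=None) -> HttpCrawlingResult:
        # Called as `self._http_client.crawl(request=..., session=..., proxy_info=..., statistics=...)`
        # by BeautifulSoupCrawler._make_http_request in this commit.
        raise NotImplementedError

    async def send_request(self, url, *, method='GET', headers=None, session=None, proxy_info=None) -> HttpResponse:
        # Called as `self._http_client.send_request(url=..., method=..., headers=..., session=..., proxy_info=...)`
        # by BasicCrawler._prepare_send_request_function in this commit.
        raise NotImplementedError
```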

### Checklist

- [x] CI passed
vdusek committed Aug 5, 2024
1 parent 3b2df6c commit 9c06260
Showing 36 changed files with 936 additions and 526 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ aioshutil = "^1.3"
beautifulsoup4 = { version = "^4.12.3", optional = true }
colorama = "^0.4.6"
cookiecutter = "^2.6.0"
curl-cffi = { version = "^0.7.1", optional = true }
docutils = "^0.21.0"
eval-type-backport = "^0.2.0"
html5lib = { version = "^1.1", optional = true }
@@ -93,6 +94,7 @@ types-python-dateutil = "~2.9.0.20240316"

[tool.poetry.extras]
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
curl-impersonate = ["curl-cffi"]
playwright = ["playwright"]

[tool.poetry.scripts]
2 changes: 1 addition & 1 deletion src/crawlee/_utils/urls.py
@@ -28,7 +28,7 @@ def validate_http_url(value: str | None) -> str | None:
"""Validate the given HTTP URL.
Raises:
pydantic.error_wrappers.ValidationError: If the URL is not valid.
pydantic.ValidationError: If the URL is not valid.
"""
if value is not None:
_http_url_adapter.validate_python(value)
3 changes: 1 addition & 2 deletions src/crawlee/basic_crawler/__init__.py
@@ -1,6 +1,5 @@
from .basic_crawler import BasicCrawler, BasicCrawlerOptions
from .context_pipeline import ContextPipeline
from .router import Router
from .types import BasicCrawlingContext

__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'ContextPipeline', 'Router', 'BasicCrawlingContext']
__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'ContextPipeline', 'Router']
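
For code that imported the moved symbols from their old locations, the changes in this commit amount to the following import updates (based only on the paths visible in these diffs):

```python
# Before this commit:
# from crawlee.basic_crawler.types import BasicCrawlingContext
# from crawlee.basic_crawler.errors import SessionError
# from crawlee.http_clients import HttpxClient

# After this commit:
from crawlee.types import BasicCrawlingContext
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient  # renamed from HttpxClient
```
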
24 changes: 12 additions & 12 deletions src/crawlee/basic_crawler/basic_crawler.py
@@ -25,24 +25,24 @@
from crawlee.autoscaling.snapshotter import Snapshotter
from crawlee.autoscaling.system_status import SystemStatus
from crawlee.basic_crawler.context_pipeline import ContextPipeline
from crawlee.basic_crawler.errors import (
from crawlee.basic_crawler.router import Router
from crawlee.configuration import Configuration
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.errors import (
ContextPipelineInitializationError,
ContextPipelineInterruptedError,
RequestHandlerError,
SessionError,
UserDefinedErrorHandlerError,
)
from crawlee.basic_crawler.router import Router
from crawlee.basic_crawler.types import BasicCrawlingContext, RequestHandlerRunResult, SendRequestFunction
from crawlee.configuration import Configuration
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.events import LocalEventManager
from crawlee.http_clients import HttpxClient
from crawlee.http_clients import HttpxHttpClient
from crawlee.log_config import CrawleeLogFormatter
from crawlee.models import BaseRequestData, DatasetItemsListPage, Request, RequestState
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
from crawlee.types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction

if TYPE_CHECKING:
import re
@@ -53,7 +53,7 @@
from crawlee.statistics import FinalStatistics, StatisticsState
from crawlee.storages.dataset import GetDataKwargs, PushDataKwargs
from crawlee.storages.request_provider import RequestProvider
from crawlee.types import JSONSerializable
from crawlee.types import HttpMethod, JSONSerializable

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]]
@@ -152,7 +152,7 @@ def __init__(
self._router = None
self.router.default_handler(request_handler)

self._http_client = http_client or HttpxClient()
self._http_client = http_client or HttpxHttpClient()

self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)

@@ -683,13 +683,13 @@ def _prepare_send_request_function(
async def send_request(
url: str,
*,
method: str = 'get',
headers: dict[str, str] | None = None,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
) -> HttpResponse:
return await self._http_client.send_request(
url,
url=url,
method=method,
headers=headers or {},
headers=headers,
session=session,
proxy_info=proxy_info,
)
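
To make the `send_request` signature change above concrete, here is a hedged sketch of calling the helper from a request handler. It assumes the helper is exposed on the crawling context as `context.send_request`, that `HttpHeaders` accepts a plain `dict[str, str]`, and that `HttpResponse` exposes `status_code`; none of these details is shown verbatim in this hunk.

```python
import asyncio

from crawlee.basic_crawler import BasicCrawler
from crawlee.types import BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # `method` now uses the HttpMethod literal type and defaults to 'GET';
        # `headers` may be omitted instead of defaulting to an empty dict.
        response = await context.send_request(
            'https://crawlee.dev/',
            method='GET',
            headers={'Accept': 'text/html'},
        )
        context.log.info(f'send_request returned status {response.status_code}')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```
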
4 changes: 2 additions & 2 deletions src/crawlee/basic_crawler/context_pipeline.py
@@ -4,14 +4,14 @@

from typing_extensions import TypeVar

from crawlee.basic_crawler.errors import (
from crawlee.errors import (
ContextPipelineFinalizationError,
ContextPipelineInitializationError,
ContextPipelineInterruptedError,
RequestHandlerError,
SessionError,
)
from crawlee.basic_crawler.types import BasicCrawlingContext
from crawlee.types import BasicCrawlingContext

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)
2 changes: 1 addition & 1 deletion src/crawlee/basic_crawler/router.py
@@ -2,7 +2,7 @@

from typing import Awaitable, Callable, Generic, TypeVar

from crawlee.basic_crawler.types import BasicCrawlingContext
from crawlee.types import BasicCrawlingContext

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
156 changes: 0 additions & 156 deletions src/crawlee/basic_crawler/types.py

This file was deleted.

16 changes: 8 additions & 8 deletions src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py
@@ -10,15 +10,15 @@
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.basic_crawler.errors import SessionError
from crawlee.beautifulsoup_crawler.types import BeautifulSoupCrawlingContext
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.http_clients import HttpxClient
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_crawler import HttpCrawlingContext
from crawlee.models import BaseRequestData

if TYPE_CHECKING:
from crawlee.basic_crawler.types import AddRequestsKwargs, BasicCrawlingContext
from crawlee.types import AddRequestsKwargs, BasicCrawlingContext


class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
@@ -55,7 +55,7 @@ def __init__(

kwargs.setdefault(
'http_client',
HttpxClient(
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
),
@@ -67,10 +67,10 @@ def __init__(

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
result = await self._http_client.crawl(
context.request,
context.session,
context.proxy_info,
self._statistics,
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
statistics=self._statistics,
)

yield HttpCrawlingContext(
2 changes: 1 addition & 1 deletion src/crawlee/beautifulsoup_crawler/types.py
@@ -3,8 +3,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee.basic_crawler.types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee.http_crawler import HttpCrawlingResult
from crawlee.types import BasicCrawlingContext, EnqueueLinksFunction

if TYPE_CHECKING:
from bs4 import BeautifulSoup
11 changes: 7 additions & 4 deletions src/crawlee/basic_crawler/errors.py → src/crawlee/errors.py
@@ -4,16 +4,15 @@

from typing_extensions import TypeVar

from crawlee.basic_crawler.types import BasicCrawlingContext
from crawlee.types import BasicCrawlingContext

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)


class UserDefinedErrorHandlerError(Exception):
"""Wraps an exception thrown from an user-defined error handler."""


TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)


class SessionError(Exception):
"""Errors of `SessionError` type will trigger a session rotation.
@@ -25,6 +24,10 @@ class ProxyError(SessionError):
"""Raised when a proxy is being blocked or malfunctions."""


class HttpStatusCodeError(Exception):
"""Raised when the response status code indicates an error."""


class RequestHandlerError(Exception, Generic[TCrawlingContext]):
"""Wraps an exception thrown from a request handler (router) and extends it with crawling context."""

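A tiny, self-contained illustration of the relocated error hierarchy above; the exceptions are raised by hand here, because where the HTTP clients raise `HttpStatusCodeError` is not shown in this hunk.

```python
from crawlee.errors import HttpStatusCodeError, ProxyError, SessionError

# ProxyError subclasses SessionError, so handlers that rotate sessions on
# SessionError also cover blocked or malfunctioning proxies.
try:
    raise ProxyError('proxy is being blocked')
except SessionError as exc:
    print(f'session-rotating error: {exc}')

# HttpStatusCodeError is a separate Exception subclass for error status codes.
try:
    raise HttpStatusCodeError('response status code 403 indicates an error')
except HttpStatusCodeError as exc:
    print(f'HTTP status error: {exc}')
```
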
14 changes: 11 additions & 3 deletions src/crawlee/http_clients/__init__.py
@@ -1,4 +1,12 @@
from .base_http_client import BaseHttpClient, HttpCrawlingResult, HttpResponse
from .httpx_client import HttpxClient
from .base import BaseHttpClient, HttpCrawlingResult, HttpResponse
from .httpx import HttpxHttpClient

__all__ = ['BaseHttpClient', 'HttpCrawlingResult', 'HttpResponse', 'HttpxClient']
try:
from .curl_impersonate import CurlImpersonateHttpClient
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'curl-impersonate' extra."
"For example, if you use pip, run `pip install 'crawlee[curl-impersonate]'`.",
) from exc

__all__ = ['BaseHttpClient', 'CurlImpersonateHttpClient', 'HttpCrawlingResult', 'HttpResponse', 'HttpxHttpClient']