feat: add new curl impersonate HTTP client (#387)

### Description

- Add a new curl impersonate HTTP client utilizing the
[curl-cffi](https://pypi.org/project/curl-cffi/) package.
- Improve API docs of the HTTP clients and define public & private
interfaces.
- I encountered a few bugs and some "not great" behaviour in `curl-cffi` and opened issues for them:
  - lexiforest/curl_cffi#360
  - lexiforest/curl_cffi#361
- Because of the above bugs, I decided not to make the curl impersonate client the default and to keep the HTTPX client instead.
- I also had to move some general components from the `basic_crawler` module to the root of the package. This may not be ideal, so I am open to other suggestions on how to organize them.

### Issues

- Closes: #292

### Testing

- New unit tests were written.
- Alternatively, check the example below, which uses the curl impersonate client with the BeautifulSoup crawler.

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    proxy_configuration = ProxyConfiguration(
        proxy_urls=[
            'http://username:password@proxy.apify.com:8000',
        ],
    )

    crawler = BeautifulSoupCrawler(
        max_requests_per_crawl=10,
        proxy_configuration=proxy_configuration,
        http_client=CurlImpersonateHttpClient(),
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}...')
        await context.enqueue_links()

        data = {
            'url': context.request.url,
            'title': context.soup.title.text if context.soup.title else '',
        }

        context.log.info(f'Extracted data: {data}')
        await context.push_data(data)

    await crawler.run(['https://apify.com', 'https://crawlee.dev/'])
    crawler.log.info('Finished crawling.')


if __name__ == '__main__':
    asyncio.run(main())
```

### TODO

- [ ] Before merging, add better documentation of the HTTP clients, covering how to switch between them and how to implement a new one (a rough skeleton of the latter is sketched below).
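
As a rough illustration of the "implementing a new one" item, the skeleton below sketches a custom HTTP client. The method names and parameters are inferred from the call sites in this commit (`crawl` in `BeautifulSoupCrawler` and `send_request` in `BasicCrawler`), so the exact abstract signatures on `BaseHttpClient` should be treated as assumptions.

```python
from __future__ import annotations

# Note: per this commit, importing from crawlee.http_clients requires the
# 'curl-impersonate' extra to be installed (see the __init__.py change below).
from crawlee.http_clients import BaseHttpClient, HttpCrawlingResult, HttpResponse


class MyHttpClient(BaseHttpClient):
    """Skeleton of a custom HTTP client; signatures are inferred, not documented."""

    async def crawl(self, request, *, session=None, proxy_info=None, statistics=None) -> HttpCrawlingResult:
        # Called as `self._http_client.crawl(request=..., session=..., proxy_info=..., statistics=...)`
        # by BeautifulSoupCrawler._make_http_request in this commit.
        raise NotImplementedError

    async def send_request(self, url, *, method='GET', headers=None, session=None, proxy_info=None) -> HttpResponse:
        # Called as `self._http_client.send_request(url=..., method=..., headers=..., session=..., proxy_info=...)`
        # by BasicCrawler._prepare_send_request_function in this commit.
        raise NotImplementedError
```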

### Checklist

- [x] CI passed
vdusek committed Aug 5, 2024
1 parent 3b2df6c commit 9c06260
Showing 36 changed files with 936 additions and 526 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ aioshutil = "^1.3"
beautifulsoup4 = { version = "^4.12.3", optional = true }
colorama = "^0.4.6"
cookiecutter = "^2.6.0"
curl-cffi = { version = "^0.7.1", optional = true }
docutils = "^0.21.0"
eval-type-backport = "^0.2.0"
html5lib = { version = "^1.1", optional = true }
@@ -93,6 +94,7 @@ types-python-dateutil = "~2.9.0.20240316"

[tool.poetry.extras]
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
curl-impersonate = ["curl-cffi"]
playwright = ["playwright"]

[tool.poetry.scripts]
2 changes: 1 addition & 1 deletion src/crawlee/_utils/urls.py
@@ -28,7 +28,7 @@ def validate_http_url(value: str | None) -> str | None:
"""Validate the given HTTP URL.
Raises:
pydantic.error_wrappers.ValidationError: If the URL is not valid.
pydantic.ValidationError: If the URL is not valid.
"""
if value is not None:
_http_url_adapter.validate_python(value)
3 changes: 1 addition & 2 deletions src/crawlee/basic_crawler/__init__.py
@@ -1,6 +1,5 @@
from .basic_crawler import BasicCrawler, BasicCrawlerOptions
from .context_pipeline import ContextPipeline
from .router import Router
from .types import BasicCrawlingContext

__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'ContextPipeline', 'Router', 'BasicCrawlingContext']
__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'ContextPipeline', 'Router']
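
For code that imported the moved symbols from their old locations, the changes in this commit amount to the following import updates (based only on the paths visible in these diffs):

```python
# Before this commit:
# from crawlee.basic_crawler.types import BasicCrawlingContext
# from crawlee.basic_crawler.errors import SessionError
# from crawlee.http_clients import HttpxClient

# After this commit:
from crawlee.types import BasicCrawlingContext
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient  # renamed from HttpxClient
```
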
24 changes: 12 additions & 12 deletions src/crawlee/basic_crawler/basic_crawler.py
@@ -25,24 +25,24 @@
from crawlee.autoscaling.snapshotter import Snapshotter
from crawlee.autoscaling.system_status import SystemStatus
from crawlee.basic_crawler.context_pipeline import ContextPipeline
from crawlee.basic_crawler.errors import (
from crawlee.basic_crawler.router import Router
from crawlee.configuration import Configuration
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.errors import (
ContextPipelineInitializationError,
ContextPipelineInterruptedError,
RequestHandlerError,
SessionError,
UserDefinedErrorHandlerError,
)
from crawlee.basic_crawler.router import Router
from crawlee.basic_crawler.types import BasicCrawlingContext, RequestHandlerRunResult, SendRequestFunction
from crawlee.configuration import Configuration
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.events import LocalEventManager
from crawlee.http_clients import HttpxClient
from crawlee.http_clients import HttpxHttpClient
from crawlee.log_config import CrawleeLogFormatter
from crawlee.models import BaseRequestData, DatasetItemsListPage, Request, RequestState
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
from crawlee.types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction

if TYPE_CHECKING:
import re
@@ -53,7 +53,7 @@
from crawlee.statistics import FinalStatistics, StatisticsState
from crawlee.storages.dataset import GetDataKwargs, PushDataKwargs
from crawlee.storages.request_provider import RequestProvider
from crawlee.types import JSONSerializable
from crawlee.types import HttpMethod, JSONSerializable

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]]
@@ -152,7 +152,7 @@ def __init__(
self._router = None
self.router.default_handler(request_handler)

self._http_client = http_client or HttpxClient()
self._http_client = http_client or HttpxHttpClient()

self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)

@@ -683,13 +683,13 @@ def _prepare_send_request_function(
async def send_request(
url: str,
*,
method: str = 'get',
headers: dict[str, str] | None = None,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
) -> HttpResponse:
return await self._http_client.send_request(
url,
url=url,
method=method,
headers=headers or {},
headers=headers,
session=session,
proxy_info=proxy_info,
)
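
To make the `send_request` signature change above concrete, here is a hedged sketch of calling the helper from a request handler. It assumes the helper is exposed on the crawling context as `context.send_request`, that `HttpHeaders` accepts a plain `dict[str, str]`, and that `HttpResponse` exposes `status_code`; none of these details is shown verbatim in this hunk.

```python
import asyncio

from crawlee.basic_crawler import BasicCrawler
from crawlee.types import BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # `method` now uses the HttpMethod literal type and defaults to 'GET';
        # `headers` may be omitted instead of defaulting to an empty dict.
        response = await context.send_request(
            'https://crawlee.dev/',
            method='GET',
            headers={'Accept': 'text/html'},
        )
        context.log.info(f'send_request returned status {response.status_code}')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```
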
4 changes: 2 additions & 2 deletions src/crawlee/basic_crawler/context_pipeline.py
@@ -4,14 +4,14 @@

from typing_extensions import TypeVar

from crawlee.basic_crawler.errors import (
from crawlee.errors import (
ContextPipelineFinalizationError,
ContextPipelineInitializationError,
ContextPipelineInterruptedError,
RequestHandlerError,
SessionError,
)
from crawlee.basic_crawler.types import BasicCrawlingContext
from crawlee.types import BasicCrawlingContext

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
TMiddlewareCrawlingContext = TypeVar('TMiddlewareCrawlingContext', bound=BasicCrawlingContext)
2 changes: 1 addition & 1 deletion src/crawlee/basic_crawler/router.py
@@ -2,7 +2,7 @@

from typing import Awaitable, Callable, Generic, TypeVar

from crawlee.basic_crawler.types import BasicCrawlingContext
from crawlee.types import BasicCrawlingContext

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
156 changes: 0 additions & 156 deletions src/crawlee/basic_crawler/types.py

This file was deleted.

16 changes: 8 additions & 8 deletions src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py
@@ -10,15 +10,15 @@
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.basic_crawler.errors import SessionError
from crawlee.beautifulsoup_crawler.types import BeautifulSoupCrawlingContext
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.http_clients import HttpxClient
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_crawler import HttpCrawlingContext
from crawlee.models import BaseRequestData

if TYPE_CHECKING:
from crawlee.basic_crawler.types import AddRequestsKwargs, BasicCrawlingContext
from crawlee.types import AddRequestsKwargs, BasicCrawlingContext


class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
@@ -55,7 +55,7 @@ def __init__(

kwargs.setdefault(
'http_client',
HttpxClient(
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
),
@@ -67,10 +67,10 @@ def __init__(

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
result = await self._http_client.crawl(
context.request,
context.session,
context.proxy_info,
self._statistics,
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
statistics=self._statistics,
)

yield HttpCrawlingContext(
2 changes: 1 addition & 1 deletion src/crawlee/beautifulsoup_crawler/types.py
@@ -3,8 +3,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee.basic_crawler.types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee.http_crawler import HttpCrawlingResult
from crawlee.types import BasicCrawlingContext, EnqueueLinksFunction

if TYPE_CHECKING:
from bs4 import BeautifulSoup
11 changes: 7 additions & 4 deletions src/crawlee/basic_crawler/errors.py → src/crawlee/errors.py
@@ -4,16 +4,15 @@

from typing_extensions import TypeVar

from crawlee.basic_crawler.types import BasicCrawlingContext
from crawlee.types import BasicCrawlingContext

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)


class UserDefinedErrorHandlerError(Exception):
"""Wraps an exception thrown from an user-defined error handler."""


TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)


class SessionError(Exception):
"""Errors of `SessionError` type will trigger a session rotation.
@@ -25,6 +24,10 @@ class ProxyError(SessionError):
"""Raised when a proxy is being blocked or malfunctions."""


class HttpStatusCodeError(Exception):
"""Raised when the response status code indicates an error."""


class RequestHandlerError(Exception, Generic[TCrawlingContext]):
"""Wraps an exception thrown from a request handler (router) and extends it with crawling context."""

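A tiny, self-contained illustration of the relocated error hierarchy above; the exceptions are raised by hand here, because where the HTTP clients raise `HttpStatusCodeError` is not shown in this hunk.

```python
from crawlee.errors import HttpStatusCodeError, ProxyError, SessionError

# ProxyError subclasses SessionError, so handlers that rotate sessions on
# SessionError also cover blocked or malfunctioning proxies.
try:
    raise ProxyError('proxy is being blocked')
except SessionError as exc:
    print(f'session-rotating error: {exc}')

# HttpStatusCodeError is a separate Exception subclass for error status codes.
try:
    raise HttpStatusCodeError('response status code 403 indicates an error')
except HttpStatusCodeError as exc:
    print(f'HTTP status error: {exc}')
```
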
14 changes: 11 additions & 3 deletions src/crawlee/http_clients/__init__.py
@@ -1,4 +1,12 @@
from .base_http_client import BaseHttpClient, HttpCrawlingResult, HttpResponse
from .httpx_client import HttpxClient
from .base import BaseHttpClient, HttpCrawlingResult, HttpResponse
from .httpx import HttpxHttpClient

__all__ = ['BaseHttpClient', 'HttpCrawlingResult', 'HttpResponse', 'HttpxClient']
try:
from .curl_impersonate import CurlImpersonateHttpClient
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'curl-impersonate' extra."
"For example, if you use pip, run `pip install 'crawlee[curl-impersonate]'`.",
) from exc

__all__ = ['BaseHttpClient', 'CurlImpersonateHttpClient', 'HttpCrawlingResult', 'HttpResponse', 'HttpxHttpClient']