Commit

docs: Update input arg docstrings of BasicCrawler and EventManager

vdusek authored Oct 30, 2024
1 parent bde0800 commit f9463e7
Showing 2 changed files with 78 additions and 29 deletions.
95 changes: 69 additions & 26 deletions src/crawlee/basic_crawler/_basic_crawler.py
@@ -64,27 +64,71 @@


class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
"""Copy of the parameter types of `BasicCrawler.__init__` meant for typing forwarded __init__ args in subclasses."""
"""Arguments for the `BasicCrawler` constructor.
It is intended for typing forwarded `__init__` arguments in the subclasses.
"""

request_provider: NotRequired[RequestProvider]
"""Provider for requests to be processed by the crawler."""

request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
"""A callable responsible for handling requests."""

http_client: NotRequired[BaseHttpClient]
"""HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling."""

concurrency_settings: NotRequired[ConcurrencySettings]
"""Settings to fine-tune concurrency levels."""

max_request_retries: NotRequired[int]
"""Maximum number of attempts to process a single request."""

max_requests_per_crawl: NotRequired[int | None]
"""Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.
Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.
Due to concurrency settings, the actual number of pages visited may slightly exceed this value."""

max_session_rotations: NotRequired[int]
"""Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
or if the website blocks the request."""

configuration: NotRequired[Configuration]
"""Crawler configuration."""

request_handler_timeout: NotRequired[timedelta]
- session_pool: NotRequired[SessionPool]
"""Maximum duration allowed for a single request handler to run."""

use_session_pool: NotRequired[bool]
"""Enable the use of a session pool for managing sessions during crawling."""

session_pool: NotRequired[SessionPool]
"""A custom `SessionPool` instance, allowing the use of non-default configuration."""

retry_on_blocked: NotRequired[bool]
"""If True, the crawler attempts to bypass bot protections automatically."""

proxy_configuration: NotRequired[ProxyConfiguration]
"""HTTP proxy configuration used when making requests."""

statistics: NotRequired[Statistics[StatisticsState]]
"""A custom `Statistics` instance, allowing the use of non-default configuration."""

event_manager: NotRequired[EventManager]
"""A custom `EventManager` instance, allowing the use of non-default configuration."""

configure_logging: NotRequired[bool]
"""If True, the crawler will set up logging infrastructure automatically."""

_context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
"""Enables extending the request lifecycle and modifying the crawling context. Intended for use by
subclasses rather than direct instantiation of `BasicCrawler`."""

_additional_context_managers: NotRequired[Sequence[AsyncContextManager]]
"""Additional context managers used throughout the crawler lifecycle."""

_logger: NotRequired[logging.Logger]
"""A logger instance, typically provided by a subclass, for consistent logging labels."""


class BasicCrawler(Generic[TCrawlingContext]):
@@ -137,31 +181,30 @@ def __init__(
"""A default constructor.
Args:
- request_provider: Provides requests to be processed.
- request_handler: A callable to which request handling is delegated.
- http_client: HTTP client to be used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
- concurrency_settings: Allows fine-tuning concurrency levels.
- max_request_retries: Maximum amount of attempts at processing a request.
- max_requests_per_crawl: Maximum number of pages that the crawler will open. The crawl will stop when
-     the limit is reached. It is recommended to set this value in order to prevent infinite loops in
-     misconfigured crawlers. None means no limit. Due to concurrency_settings, the actual number of pages
-     visited may slightly exceed this value.
- max_session_rotations: Maximum number of session rotations per request.
-     The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by
-     the website.
request_provider: Provider for requests to be processed by the crawler.
request_handler: A callable responsible for handling requests.
http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
concurrency_settings: Settings to fine-tune concurrency levels.
max_request_retries: Maximum number of attempts to process a single request.
max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
this value.
max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
if a proxy error occurs or if the website blocks the request.
configuration: Crawler configuration.
- request_handler_timeout: How long a single request handler is allowed to run.
- use_session_pool: Enables using the session pool for crawling.
- session_pool: A preconfigured `SessionPool` instance if you wish to use non-default configuration.
- retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection.
- proxy_configuration: A HTTP proxy configuration to be used for making requests.
- statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration.
- event_manager: A custom `EventManager` instance if you wish to use a non-default one.
- configure_logging: If set to True, the crawler will configure the logging infrastructure.
- _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
-     This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
- _additional_context_managers: Additional context managers to be used in the crawler lifecycle.
- _logger: A logger instance passed from a child class to ensure consistent labels.
request_handler_timeout: Maximum duration allowed for a single request handler to run.
use_session_pool: Enable the use of a session pool for managing sessions during crawling.
session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
proxy_configuration: HTTP proxy configuration used when making requests.
statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
event_manager: A custom `EventManager` instance, allowing the use of non-default configuration.
configure_logging: If True, the crawler will set up logging infrastructure automatically.
_context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
_additional_context_managers: Additional context managers used throughout the crawler lifecycle.
_logger: A logger instance, typically provided by a subclass, for consistent logging labels.
"""
self._router: Router[TCrawlingContext] | None = None
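
For orientation, here is a hedged usage sketch (not taken from the repository) exercising several of the arguments documented above through `HttpCrawler`, which forwards them to `BasicCrawler`; the `HttpCrawler` import path and the example URL are assumptions.

import asyncio
from datetime import timedelta

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext  # assumed import path


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=2,                          # retry a failing request at most twice
        max_requests_per_crawl=50,                      # stop after roughly 50 pages
        max_session_rotations=5,                        # rotate the session up to 5 times when blocked
        request_handler_timeout=timedelta(seconds=30),  # per-request handler time limit
        configure_logging=True,                         # let the crawler set up logging
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())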

12 changes: 9 additions & 3 deletions src/crawlee/events/_event_manager.py
@@ -25,10 +25,16 @@


class EventManagerOptions(TypedDict):
"""Parameter types for subclass __init__ methods, copied from EventManager.__init__."""
"""Arguments for the `EventManager` constructor.
It is intended for typing forwarded `__init__` arguments in the subclasses.
"""

persist_state_interval: NotRequired[timedelta]
"""Interval between emitted `PersistState` events to maintain state persistence."""

close_timeout: NotRequired[timedelta | None]
"""Optional timeout for canceling pending event listeners if they exceed this duration."""


class EventManager:
@@ -47,8 +53,8 @@ def __init__(
"""A default constructor.
Args:
- persist_state_interval: Interval at which `PersistState` events are emitted.
- close_timeout: Optional timeout after which the pending event listeners are canceled.
persist_state_interval: Interval between emitted `PersistState` events to maintain state persistence.
close_timeout: Optional timeout for canceling pending event listeners if they exceed this duration.
"""
self._persist_state_interval = persist_state_interval
self._close_timeout = close_timeout
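
A short, assumed-usage sketch (not part of this commit) of the two options above: crawlee's `LocalEventManager` forwards them to this constructor, and the resulting instance can be passed to a crawler via the `event_manager` argument documented earlier. The `LocalEventManager` import path is an assumption.

from datetime import timedelta

from crawlee.events import LocalEventManager  # assumed import path

event_manager = LocalEventManager(
    persist_state_interval=timedelta(minutes=1),  # emit PersistState roughly once a minute
    close_timeout=timedelta(seconds=5),           # give pending listeners 5 s before cancelling
)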
