Commit

docs: Update input arg docstrings of BasicCrawler and EventManager

vdusek authored Oct 30, 2024
1 parent bde0800 commit f9463e7
Showing 2 changed files with 78 additions and 29 deletions.
95 changes: 69 additions & 26 deletions src/crawlee/basic_crawler/_basic_crawler.py
@@ -64,27 +64,71 @@


class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
"""Copy of the parameter types of `BasicCrawler.__init__` meant for typing forwarded __init__ args in subclasses."""
"""Arguments for the `BasicCrawler` constructor.
It is intended for typing forwarded `__init__` arguments in the subclasses.
"""

request_provider: NotRequired[RequestProvider]
"""Provider for requests to be processed by the crawler."""

request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
"""A callable responsible for handling requests."""

http_client: NotRequired[BaseHttpClient]
"""HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling."""

concurrency_settings: NotRequired[ConcurrencySettings]
"""Settings to fine-tune concurrency levels."""

max_request_retries: NotRequired[int]
"""Maximum number of attempts to process a single request."""

max_requests_per_crawl: NotRequired[int | None]
"""Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.
Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means no limit.
Due to concurrency settings, the actual number of pages visited may slightly exceed this value."""

max_session_rotations: NotRequired[int]
"""Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
or if the website blocks the request."""

configuration: NotRequired[Configuration]
"""Crawler configuration."""

request_handler_timeout: NotRequired[timedelta]
- session_pool: NotRequired[SessionPool]
"""Maximum duration allowed for a single request handler to run."""

use_session_pool: NotRequired[bool]
"""Enable the use of a session pool for managing sessions during crawling."""

session_pool: NotRequired[SessionPool]
"""A custom `SessionPool` instance, allowing the use of non-default configuration."""

retry_on_blocked: NotRequired[bool]
"""If True, the crawler attempts to bypass bot protections automatically."""

proxy_configuration: NotRequired[ProxyConfiguration]
"""HTTP proxy configuration used when making requests."""

statistics: NotRequired[Statistics[StatisticsState]]
"""A custom `Statistics` instance, allowing the use of non-default configuration."""

event_manager: NotRequired[EventManager]
"""A custom `EventManager` instance, allowing the use of non-default configuration."""

configure_logging: NotRequired[bool]
"""If True, the crawler will set up logging infrastructure automatically."""

_context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
"""Enables extending the request lifecycle and modifying the crawling context. Intended for use by
subclasses rather than direct instantiation of `BasicCrawler`."""

_additional_context_managers: NotRequired[Sequence[AsyncContextManager]]
"""Additional context managers used throughout the crawler lifecycle."""

_logger: NotRequired[logging.Logger]
"""A logger instance, typically provided by a subclass, for consistent logging labels."""


class BasicCrawler(Generic[TCrawlingContext]):
@@ -137,31 +181,30 @@ def __init__(
"""A default constructor.
Args:
- request_provider: Provides requests to be processed.
- request_handler: A callable to which request handling is delegated.
- http_client: HTTP client to be used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
- concurrency_settings: Allows fine-tuning concurrency levels.
- max_request_retries: Maximum amount of attempts at processing a request.
- max_requests_per_crawl: Maximum number of pages that the crawler will open. The crawl will stop when
-     the limit is reached. It is recommended to set this value in order to prevent infinite loops in
-     misconfigured crawlers. None means no limit. Due to concurrency_settings, the actual number of pages
-     visited may slightly exceed this value.
- max_session_rotations: Maximum number of session rotations per request.
-     The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by
-     the website.
request_provider: Provider for requests to be processed by the crawler.
request_handler: A callable responsible for handling requests.
http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
concurrency_settings: Settings to fine-tune concurrency levels.
max_request_retries: Maximum number of attempts to process a single request.
max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
this value.
max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
if a proxy error occurs or if the website blocks the request.
configuration: Crawler configuration.
- request_handler_timeout: How long a single request handler is allowed to run.
- use_session_pool: Enables using the session pool for crawling.
- session_pool: A preconfigured `SessionPool` instance if you wish to use non-default configuration.
- retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection.
- proxy_configuration: A HTTP proxy configuration to be used for making requests.
- statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration.
- event_manager: A custom `EventManager` instance if you wish to use a non-default one.
- configure_logging: If set to True, the crawler will configure the logging infrastructure.
- _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
-     This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
- _additional_context_managers: Additional context managers to be used in the crawler lifecycle.
- _logger: A logger instance passed from a child class to ensure consistent labels.
request_handler_timeout: Maximum duration allowed for a single request handler to run.
use_session_pool: Enable the use of a session pool for managing sessions during crawling.
session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
proxy_configuration: HTTP proxy configuration used when making requests.
statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
event_manager: A custom `EventManager` instance, allowing the use of non-default configuration.
configure_logging: If True, the crawler will set up logging infrastructure automatically.
_context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
_additional_context_managers: Additional context managers used throughout the crawler lifecycle.
_logger: A logger instance, typically provided by a subclass, for consistent logging labels.
"""
self._router: Router[TCrawlingContext] | None = None
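
For orientation, here is a hedged usage sketch (not taken from the repository) exercising several of the arguments documented above through `HttpCrawler`, which forwards them to `BasicCrawler`; the `HttpCrawler` import path and the example URL are assumptions.

import asyncio
from datetime import timedelta

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext  # assumed import path


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=2,                          # retry a failing request at most twice
        max_requests_per_crawl=50,                      # stop after roughly 50 pages
        max_session_rotations=5,                        # rotate the session up to 5 times when blocked
        request_handler_timeout=timedelta(seconds=30),  # per-request handler time limit
        configure_logging=True,                         # let the crawler set up logging
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())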

12 changes: 9 additions & 3 deletions src/crawlee/events/_event_manager.py
@@ -25,10 +25,16 @@


class EventManagerOptions(TypedDict):
"""Parameter types for subclass __init__ methods, copied from EventManager.__init__."""
"""Arguments for the `EventManager` constructor.
It is intended for typing forwarded `__init__` arguments in the subclasses.
"""

persist_state_interval: NotRequired[timedelta]
"""Interval between emitted `PersistState` events to maintain state persistence."""

close_timeout: NotRequired[timedelta | None]
"""Optional timeout for canceling pending event listeners if they exceed this duration."""


class EventManager:
@@ -47,8 +53,8 @@ def __init__(
"""A default constructor.
Args:
- persist_state_interval: Interval at which `PersistState` events are emitted.
- close_timeout: Optional timeout after which the pending event listeners are canceled.
persist_state_interval: Interval between emitted `PersistState` events to maintain state persistence.
close_timeout: Optional timeout for canceling pending event listeners if they exceed this duration.
"""
self._persist_state_interval = persist_state_interval
self._close_timeout = close_timeout
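
A short, assumed-usage sketch (not part of this commit) of the two options above: crawlee's `LocalEventManager` forwards them to this constructor, and the resulting instance can be passed to a crawler via the `event_manager` argument documented earlier. The `LocalEventManager` import path is an assumption.

from datetime import timedelta

from crawlee.events import LocalEventManager  # assumed import path

event_manager = LocalEventManager(
    persist_state_interval=timedelta(minutes=1),  # emit PersistState roughly once a minute
    close_timeout=timedelta(seconds=5),           # give pending listeners 5 s before cancelling
)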
