Skip to content

Commit 826f4db

Browse files
vdusek and janbuchar authored
feat: Upgrade to Crawlee v0.5 (#355)
- service locator
- updates in proxy configuration

---------

Co-authored-by: Jan Buchar <jan.buchar@apify.com>
1 parent d66265c commit 826f4db

22 files changed

+586
-232
lines changed

Diff for: poetry.lock

+304-56
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: pyproject.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ keywords = [
4545
python = "^3.9"
4646
apify-client = ">=1.8.1"
4747
apify-shared = ">=1.2.1"
48-
crawlee = "~0.4.0"
48+
crawlee = "~0.5.1"
4949
cryptography = ">=42.0.0"
5050
httpx = ">=0.27.0"
5151
lazy-object-proxy = ">=1.10.0"

Diff for: src/apify/_actor.py

+36-23
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
from apify_client import ApifyClientAsync
1414
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
1515
from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
16-
from crawlee import service_container
16+
from crawlee import service_locator
1717
from crawlee.events._types import Event, EventMigratingData, EventPersistStateData
1818

1919
from apify._configuration import Configuration
@@ -34,6 +34,7 @@
3434
from typing_extensions import Self
3535

3636
from crawlee.proxy_configuration import _NewUrlFunction
37+
from crawlee.storage_clients import BaseStorageClient
3738

3839
from apify._models import Webhook
3940

@@ -71,17 +72,22 @@ def __init__(
7172
self._configure_logging = configure_logging
7273
self._apify_client = self.new_client()
7374

74-
self._event_manager: EventManager
75-
if self._configuration.is_at_home:
76-
self._event_manager = PlatformEventManager(
75+
# Create an instance of the cloud storage client, the local storage client is obtained
76+
# from the service locator.
77+
self._cloud_storage_client = ApifyStorageClient.from_config(config=self._configuration)
78+
79+
# Set the event manager based on whether the Actor is running on the platform or locally.
80+
self._event_manager = (
81+
PlatformEventManager(
7782
config=self._configuration,
7883
persist_state_interval=self._configuration.persist_state_interval,
7984
)
80-
else:
81-
self._event_manager = LocalEventManager(
85+
if self.is_at_home()
86+
else LocalEventManager(
8287
system_info_interval=self._configuration.system_info_interval,
8388
persist_state_interval=self._configuration.persist_state_interval,
8489
)
90+
)
8591

8692
self._is_initialized = False
8793

@@ -94,9 +100,6 @@ async def __aenter__(self) -> Self:
94100
When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
95101
executing the block code, the `Actor.fail` method is called.
96102
"""
97-
if self._configure_logging:
98-
_configure_logging(self._configuration)
99-
100103
await self.init()
101104
return self
102105

@@ -156,6 +159,11 @@ def log(self) -> logging.Logger:
156159
"""The logging.Logger instance the Actor uses."""
157160
return logger
158161

162+
@property
163+
def _local_storage_client(self) -> BaseStorageClient:
164+
"""The local storage client the Actor instance uses."""
165+
return service_locator.get_storage_client()
166+
159167
def _raise_if_not_initialized(self) -> None:
160168
if not self._is_initialized:
161169
raise RuntimeError('The Actor was not initialized!')
@@ -184,18 +192,19 @@ async def init(self) -> None:
184192
if self._is_initialized:
185193
raise RuntimeError('The Actor was already initialized!')
186194

187-
if self._configuration.token:
188-
service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration))
195+
self._is_exiting = False
196+
self._was_final_persist_state_emitted = False
189197

190-
if self._configuration.is_at_home:
191-
service_container.set_default_storage_client_type('cloud')
192-
else:
193-
service_container.set_default_storage_client_type('local')
198+
# If the Actor is running on the Apify platform, we set the cloud storage client.
199+
if self.is_at_home():
200+
service_locator.set_storage_client(self._cloud_storage_client)
194201

195-
service_container.set_event_manager(self._event_manager)
202+
service_locator.set_event_manager(self.event_manager)
203+
service_locator.set_configuration(self.configuration)
196204

197-
self._is_exiting = False
198-
self._was_final_persist_state_emitted = False
205+
# The logging configuration has to be called after all service_locator set methods.
206+
if self._configure_logging:
207+
_configure_logging()
199208

200209
self.log.info('Initializing Actor...')
201210
self.log.info('System info', extra=get_system_info())
@@ -245,7 +254,6 @@ async def finalize() -> None:
245254
await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
246255

247256
await self._event_manager.__aexit__(None, None, None)
248-
cast(dict, service_container._services).clear() # noqa: SLF001
249257

250258
await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
251259
self._is_initialized = False
@@ -349,11 +357,13 @@ async def open_dataset(
349357
self._raise_if_not_initialized()
350358
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
351359

360+
storage_client = self._cloud_storage_client if force_cloud else self._local_storage_client
361+
352362
return await Dataset.open(
353363
id=id,
354364
name=name,
355365
configuration=self._configuration,
356-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
366+
storage_client=storage_client,
357367
)
358368

359369
async def open_key_value_store(
@@ -381,12 +391,13 @@ async def open_key_value_store(
381391
"""
382392
self._raise_if_not_initialized()
383393
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
394+
storage_client = self._cloud_storage_client if force_cloud else self._local_storage_client
384395

385396
return await KeyValueStore.open(
386397
id=id,
387398
name=name,
388399
configuration=self._configuration,
389-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
400+
storage_client=storage_client,
390401
)
391402

392403
async def open_request_queue(
@@ -417,11 +428,13 @@ async def open_request_queue(
417428
self._raise_if_not_initialized()
418429
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
419430

431+
storage_client = self._cloud_storage_client if force_cloud else self._local_storage_client
432+
420433
return await RequestQueue.open(
421434
id=id,
422435
name=name,
423436
configuration=self._configuration,
424-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
437+
storage_client=storage_client,
425438
)
426439

427440
async def push_data(self, data: dict | list[dict]) -> None:
@@ -963,7 +976,7 @@ async def create_proxy_configuration(
963976
password: str | None = None,
964977
groups: list[str] | None = None,
965978
country_code: str | None = None,
966-
proxy_urls: list[str] | None = None,
979+
proxy_urls: list[str | None] | None = None,
967980
new_url_function: _NewUrlFunction | None = None,
968981
) -> ProxyConfiguration | None:
969982
"""Create a ProxyConfiguration object with the passed proxy configuration.

Diff for: src/apify/_configuration.py

+12
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from datetime import datetime, timedelta
4+
from logging import getLogger
45
from typing import Annotated, Any
56

67
from pydantic import AliasChoices, BeforeValidator, Field
@@ -12,6 +13,8 @@
1213

1314
from apify._utils import docs_group
1415

16+
logger = getLogger(__name__)
17+
1518

1619
def _transform_to_list(value: Any) -> list[str] | None:
1720
if value is None:
@@ -353,6 +356,15 @@ class Configuration(CrawleeConfiguration):
353356
),
354357
] = None
355358

359+
@classmethod
360+
def get_global_configuration(cls) -> Configuration:
361+
"""Retrieve the global instance of the configuration.
362+
363+
Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`
364+
instead.
365+
"""
366+
return cls()
367+
356368

357369
# Monkey-patch the base class so that it works with the extended configuration
358370
CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]

Diff for: src/apify/_proxy_configuration.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -111,9 +111,9 @@ def __init__(
111111
password: str | None = None,
112112
groups: list[str] | None = None,
113113
country_code: str | None = None,
114-
proxy_urls: list[str] | None = None,
114+
proxy_urls: list[str | None] | None = None,
115115
new_url_function: _NewUrlFunction | None = None,
116-
tiered_proxy_urls: list[list[str]] | None = None,
116+
tiered_proxy_urls: list[list[str | None]] | None = None,
117117
_actor_config: Configuration | None = None,
118118
_apify_client: ApifyClientAsync | None = None,
119119
) -> None:
@@ -148,7 +148,7 @@ def __init__(
148148
' "groups" or "country_code".'
149149
)
150150

151-
if proxy_urls and any('apify.com' in url for url in proxy_urls):
151+
if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls):
152152
logger.warning(
153153
'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties '
154154
'instead of `proxy_urls`.\n'

Diff for: src/apify/apify_storage_client/_apify_storage_client.py

+12-2
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,13 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
15
from typing_extensions import override
26

37
from apify_client import ApifyClientAsync
48
from crawlee._utils.crypto import crypto_random_object_id
5-
from crawlee.base_storage_client import BaseStorageClient
9+
from crawlee.storage_clients import BaseStorageClient
610

7-
from apify._configuration import Configuration
811
from apify._utils import docs_group
912
from apify.apify_storage_client._dataset_client import DatasetClient
1013
from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
@@ -13,6 +16,9 @@
1316
from apify.apify_storage_client._request_queue_client import RequestQueueClient
1417
from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
1518

19+
if TYPE_CHECKING:
20+
from apify._configuration import Configuration
21+
1622

1723
@docs_group('Classes')
1824
class ApifyStorageClient(BaseStorageClient):
@@ -29,6 +35,10 @@ def __init__(self, *, configuration: Configuration) -> None:
2935
)
3036
self._configuration = configuration
3137

38+
@classmethod
39+
def from_config(cls, config: Configuration) -> ApifyStorageClient:
40+
return cls(configuration=config)
41+
3242
@override
3343
def dataset(self, id: str) -> DatasetClient:
3444
return DatasetClient(self._apify_client.dataset(id))

Diff for: src/apify/apify_storage_client/_dataset_client.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44

55
from typing_extensions import override
66

7-
from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata
7+
from crawlee.storage_clients._base import BaseDatasetClient
8+
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
89

910
if TYPE_CHECKING:
1011
from collections.abc import AsyncIterator

Diff for: src/apify/apify_storage_client/_dataset_collection_client.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44

55
from typing_extensions import override
66

7-
from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata
7+
from crawlee.storage_clients._base import BaseDatasetCollectionClient
8+
from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata
89

910
if TYPE_CHECKING:
1011
from apify_client.clients import DatasetCollectionClientAsync

Diff for: src/apify/apify_storage_client/_key_value_store_client.py

+2-6
Original file line number | Diff line number | Diff line change
@@ -5,12 +5,8 @@
55

66
from typing_extensions import override
77

8-
from crawlee.base_storage_client import (
9-
BaseKeyValueStoreClient,
10-
KeyValueStoreListKeysPage,
11-
KeyValueStoreMetadata,
12-
KeyValueStoreRecord,
13-
)
8+
from crawlee.storage_clients._base import BaseKeyValueStoreClient
9+
from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
1410

1511
if TYPE_CHECKING:
1612
from collections.abc import AsyncIterator

Diff for: src/apify/apify_storage_client/_key_value_store_collection_client.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44

55
from typing_extensions import override
66

7-
from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata
7+
from crawlee.storage_clients._base import BaseKeyValueStoreCollectionClient
8+
from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata
89

910
if TYPE_CHECKING:
1011
from apify_client.clients import KeyValueStoreCollectionClientAsync

Diff for: src/apify/apify_storage_client/_request_queue_client.py

+2-22
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@
55
from typing_extensions import override
66

77
from crawlee import Request
8-
from crawlee.base_storage_client import (
9-
BaseRequestQueueClient,
8+
from crawlee.storage_clients._base import BaseRequestQueueClient
9+
from crawlee.storage_clients.models import (
1010
BatchRequestsOperationResponse,
1111
ProcessedRequest,
1212
ProlongRequestLockResponse,
@@ -80,10 +80,6 @@ async def add_request(
8080
by_alias=True,
8181
exclude={
8282
'id',
83-
'json_',
84-
'order_no',
85-
'query_params',
86-
'data',
8783
},
8884
),
8985
forefront=forefront,
@@ -107,12 +103,6 @@ async def update_request(
107103
| await self._client.update_request(
108104
request=request.model_dump(
109105
by_alias=True,
110-
exclude={
111-
'json_',
112-
'order_no',
113-
'query_params',
114-
'data',
115-
},
116106
),
117107
forefront=forefront,
118108
)
@@ -164,10 +154,6 @@ async def batch_add_requests(
164154
by_alias=True,
165155
exclude={
166156
'id',
167-
'json_',
168-
'order_no',
169-
'query_params',
170-
'data',
171157
},
172158
)
173159
for r in requests
@@ -183,12 +169,6 @@ async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsO
183169
requests=[
184170
r.model_dump(
185171
by_alias=True,
186-
exclude={
187-
'json_',
188-
'order_no',
189-
'query_params',
190-
'data',
191-
},
192172
)
193173
for r in requests
194174
],

Diff for: src/apify/apify_storage_client/_request_queue_collection_client.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44

55
from typing_extensions import override
66

7-
from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata
7+
from crawlee.storage_clients._base import BaseRequestQueueCollectionClient
8+
from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata
89

910
if TYPE_CHECKING:
1011
from apify_client.clients import RequestQueueCollectionClientAsync

0 commit comments

Comments
 (0)