[serve] immediately send ping in router when receiving new replica set #47053
Changes from 5 commits
@@ -17,8 +17,10 @@
```python
    Tuple,
)

import ray
from ray.exceptions import ActorDiedError, ActorUnavailableError
from ray.serve._private.common import (
    DeploymentHandleSource,
    DeploymentID,
    ReplicaID,
    RequestMetadata,
```
@@ -89,6 +91,7 @@ def __init__(
```python
        self,
        event_loop: asyncio.AbstractEventLoop,
        deployment_id: DeploymentID,
        handle_source: DeploymentHandleSource,
        prefer_local_node_routing: bool = False,
        prefer_local_az_routing: bool = False,
        self_node_id: Optional[str] = None,
```
@@ -99,6 +102,7 @@ def __init__(
```python
    ):
        self._loop = event_loop
        self._deployment_id = deployment_id
        self._handle_source = handle_source
        self._prefer_local_node_routing = prefer_local_node_routing
        self._prefer_local_az_routing = prefer_local_az_routing
        self._self_node_id = self_node_id
```
@@ -240,7 +244,16 @@ def update_replicas(self, replicas: List[ReplicaWrapper]):
```python
        new_replica_id_set = set()
        new_colocated_replica_ids = defaultdict(set)
        new_multiplexed_model_id_to_replica_ids = defaultdict(set)

        for r in replicas:
            # If on the proxy, replica needs to call back into the proxy with
            # `receive_asgi_messages` which can be blocked when GCS is down.
            # To prevent that from happening, push proxy handle eagerly
            if self._handle_source == DeploymentHandleSource.PROXY:
                r._actor_handle.push_proxy_handle.remote(
                    ray.get_runtime_context().current_actor
                )

            new_replicas[r.replica_id] = r
            new_replica_id_set.add(r.replica_id)
            if self._self_node_id is not None and r.node_id == self._self_node_id:
```

Review comments on the `r._actor_handle.push_proxy_handle.remote(...)` call:

- let's add a method to the interface, shouldn't be accessing the `_actor_handle`
- that way it can be tested as well
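A sketch of what the reviewer's suggestion could look like: expose the push as a method on the `ReplicaWrapper` interface so the scheduler never reaches into the private `_actor_handle` and the call can be faked in unit tests. The method body and placement below are assumptions, not the PR's final code.

```python
# Sketch only (not the final PR code): add push_proxy_handle() to the replica
# wrapper interface so callers never touch the private _actor_handle.
class ReplicaWrapper:
    ...

    def push_proxy_handle(self, handle) -> None:
        """Eagerly send the given actor handle to the replica actor."""
        self._actor_handle.push_proxy_handle.remote(handle)


# update_replicas() would then use the wrapper method instead:
#     if self._handle_source == DeploymentHandleSource.PROXY:
#         r.push_proxy_handle(ray.get_runtime_context().current_actor)
```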
@@ -272,6 +285,8 @@ def update_replicas(self, replicas: List[ReplicaWrapper]):
```python
        self._replica_queue_len_cache.remove_inactive_replicas(
            active_replica_ids=new_replica_id_set
        )
        # Populate cache for all replicas
        self._loop.create_task(self._probe_queue_lens(list(self._replicas.values()), 0))

        self._replicas_updated_event.set()
        self.maybe_start_scheduling_tasks()
```

Review comments on the new `_probe_queue_lens` call:

- hm, can we do this only for the replicas that were added instead of all?
- yes! for some reason I thought it would mess with the fault tolerance, but it seems like the actor info is stored per-process, not per actor handle. changed to only ping new replicas.
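The "only ping new replicas" follow-up described in the thread is not part of the revision shown above. A rough sketch of the idea, assuming the previous replica IDs are captured before `self._replicas` is swapped out, might look like this (names mirror the diff, but this is not the PR's final code):

```python
# Sketch only: remember which replica IDs existed before the update...
existing_replica_ids = set(self._replicas.keys())

# ...then, after the new replica set has been built, probe queue lengths
# only for replicas that were not present before.
added_replicas = [
    new_replicas[replica_id]
    for replica_id in new_replica_id_set - existing_replica_ids
]
if added_replicas:
    self._loop.create_task(self._probe_queue_lens(added_replicas, 0))
```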
A further review thread on the pushed handle:

- Should we do something to the handle? Also maybe add a type hint if it's required 🙃
- doing something with the handle seems unnecessary for now. I think if you pass any actor handle as an argument in a ray remote call, then ray core does some processing under the hood that requires making a call to the GCS, so if this `actor_handle` was never "pushed" to the actor beforehand then this call hangs. "Pushing" it once is enough to unblock the call though when the GCS goes down.
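To make the mechanism concrete, here is a small, self-contained Ray example with generic actors (not Serve internals): "pushing" a handle just means passing it once as an argument to a remote call, after which the receiving actor can use it even if the GCS later becomes unavailable.

```python
import ray

ray.init()


@ray.remote
class Proxy:
    def ping(self):
        return "pong"


@ray.remote
class Replica:
    def __init__(self):
        self._proxy_handle = None

    def push_proxy_handle(self, handle):
        # Receiving the handle as an argument is the "push": the worker
        # process resolves it once (which can involve a GCS lookup) and can
        # reuse it afterwards without going back to the GCS.
        self._proxy_handle = handle

    def call_back(self):
        # Uses the previously pushed handle to call back into the proxy.
        return ray.get(self._proxy_handle.ping.remote())


proxy = Proxy.remote()
replica = Replica.remote()
ray.get(replica.push_proxy_handle.remote(proxy))
print(ray.get(replica.call_back.remote()))  # "pong"
```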