Remove V0 Ray executor and migrate V1 implementation #25213
base: main
Conversation
@ruisearch42 PTAL
Code Review
This pull request effectively removes the v0 Ray executor and consolidates its logic into a standalone v1 implementation. The change to use a non-blocking FutureWrapper for asynchronous operations is a significant improvement for performance and responsiveness. My main feedback is that the migration has introduced a substantial amount of dead code from the v0 executor's non-SPMD mode, which is not supported in the v1 executor. Removing this dead code would greatly improve the maintainability and clarity of the new RayDistributedExecutor.
```python
if not self.use_ray_compiled_dag:
    self.driver_exec_method = make_async(
        self.driver_worker.execute_method)
```
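For readers unfamiliar with the wrapper used in this (now-dead) v0 path: a `make_async`-style helper typically offloads a blocking call to the event loop's thread pool. A hedged sketch, not necessarily the exact vLLM utility:

```python
import asyncio
from functools import partial
from typing import Any, Callable


def make_async(func: Callable[..., Any]) -> Callable[..., "asyncio.Future"]:
    """Sketch: wrap a blocking callable so the caller can await it; the
    work runs in the loop's default thread-pool executor. Must be invoked
    from within a running event loop."""

    def _async_wrapper(*args, **kwargs) -> "asyncio.Future":
        loop = asyncio.get_running_loop()
        return loop.run_in_executor(None, partial(func, *args, **kwargs))

    return _async_wrapper
```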
```python
if not self.use_ray_spmd_worker:
    raise RuntimeError(
        "RayDistributedExecutor in v1 requires "
        "VLLM_USE_RAY_SPMD_WORKER=1")
```
```python
if not self.use_ray_spmd_worker:
    for i, each in enumerate(worker_metadata):
        # find and remove the dummy worker from the list
        worker = each.worker
        worker_ip = each.ip
        if self.driver_dummy_worker is None and worker_ip == driver_ip:
            # If the worker is on the same node as the driver, we use it
            # as the resource holder for the driver process.
            self.driver_dummy_worker = worker
            self.driver_worker = RayWorkerWrapper(
                vllm_config=self.vllm_config, rpc_rank=0)
            worker_metadata.pop(i)
            break

logger.debug("workers: %s", worker_metadata)
logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
if not self.use_ray_spmd_worker and self.driver_dummy_worker is None:
    raise ValueError(
        "Ray does not allocate any GPUs on the driver node."
        f"Driver IP: {driver_ip}, worker IPs: {worker_ips}."
        "Consider adjusting the Ray placement group or running "
        "the driver on a GPU node.")
```
```python
# node will be placed first.
sorted_worker_metadata = sorted(worker_metadata,
                                key=sort_by_driver_then_worker_ip)
start_rank = 0 if self.use_ray_spmd_worker else 1
```
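The sort above is what guarantees that workers co-located with the driver get the lowest ranks. A hedged sketch of a key with that behavior (the real `sort_by_driver_then_worker_ip` helper may differ in detail):

```python
def make_sort_key(driver_ip: str):
    """Sketch of the assumed behavior: metadata for workers on the driver's
    node sorts first, and the remaining workers are grouped by node IP so
    that ranks on the same node stay contiguous."""

    def sort_by_driver_then_worker_ip(item):
        return (0 if item.ip == driver_ip else 1, item.ip)

    return sort_by_driver_then_worker_ip
```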
```python
def _driver_execute_model(
    self, execute_model_req: Optional[ExecuteModelRequest]
) -> Optional[List[SamplerOutput]]:
    """Run execute_model in the driver worker."""

    assert not self.use_ray_spmd_worker, (
        "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
    return self.driver_worker.execute_method("execute_model",
                                             execute_model_req)
```
```python
if not self.use_ray_spmd_worker:
    # Start the driver worker after all the ray workers.
    driver_worker_output = [
        self.driver_worker.execute_method(sent_method, *args, **kwargs)
    ]
```
```python
async def _driver_execute_model_async(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None
) -> List[SamplerOutput]:
    assert not self.use_ray_spmd_worker, (
        "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1")
    if not self.tp_driver_workers:
        return await self.driver_exec_method("execute_model",
                                             execute_model_req)
    if self.pp_locks is None:
        self.pp_locks = [
            asyncio.Lock()
            for _ in range(self.parallel_config.pipeline_parallel_size)
        ]

    tasks = [
        asyncio.create_task(
            _run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
                                "execute_model", execute_model_req))
    ]
    for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
                                            start=1):
        tasks.append(
            asyncio.create_task(
                _run_task_with_lock(driver_worker.execute_method.remote,
                                    self.pp_locks[pp_rank],
                                    "execute_model", execute_model_req)))

    results = await asyncio.gather(*tasks)

    # Only the last PP stage has the final results.
    return results[-1]
```
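The pipeline-parallel branch above serializes calls to each stage's driver with a per-stage lock. Based on how it is called, `_run_task_with_lock` amounts to something like the following sketch (assumed shape, not a verbatim copy of the vLLM helper):

```python
import asyncio


async def _run_task_with_lock(task, lock: asyncio.Lock, *args, **kwargs):
    """Run `task(*args, **kwargs)` while holding `lock`, so concurrent
    execute_model calls to the same pipeline stage cannot interleave."""
    async with lock:
        return await task(*args, **kwargs)
```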
```python
async def _start_worker_execution_loop(self):
    assert not self.use_ray_spmd_worker, (
        "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1")
    coros = [
        worker.execute_method.remote("start_worker_execution_loop")
        for worker in self.non_driver_workers
    ]
    return await asyncio.gather(*coros)
```
@codex fix comments
This pull request has merge conflicts that must be resolved before it can be merged.
Ray V0 executor has now been removed in #27142, but that didn't include the
Sorry, I think I missed Simon's earlier ping. The
Summary
Testing
https://chatgpt.com/codex/tasks/task_e_68cc992ce0348329bd2b32d08b588b9c