
Commit a341722

Add callback and fix other comments
Signed-off-by: Ryan O'Leary <ryanaoleary@google.com>
1 parent 360b952 commit a341722

9 files changed (+271, -159 lines)

python/ray/_private/accelerators/tpu.py

Lines changed: 0 additions & 19 deletions
@@ -9,7 +9,6 @@
 
 import ray
 from ray._private.accelerators.accelerator import AcceleratorManager
-from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
 logger = logging.getLogger(__name__)
 
@@ -128,24 +127,6 @@ def infer_tpu_pod_type_from_topology(
     return None
 
 
-def fetch_tpu_slice_name_from_pg(pg):
-    @ray.remote(num_cpus=0)
-    def _get_tpu_slice_name():
-        import ray
-
-        return (
-            ray._private.accelerators.TPUAcceleratorManager.get_current_node_tpu_name()
-        )
-
-    tpu_name_ref = _get_tpu_slice_name.options(
-        scheduling_strategy=PlacementGroupSchedulingStrategy(
-            placement_group=pg, placement_group_bundle_index=0
-        )
-    ).remote()
-
-    return ray.get(tpu_name_ref)
-
-
 class TPUAcceleratorManager(AcceleratorManager):
     """Google TPU accelerators."""

python/ray/train/v2/BUILD

Lines changed: 17 additions & 1 deletion
@@ -488,7 +488,23 @@ py_test(
 py_test(
     name = "test_jax_trainer",
     size = "small",
-    srcs = ["tests/test_xgboost_trainer.py"],
+    srcs = ["tests/test_jax_trainer.py"],
+    env = {"RAY_TRAIN_V2_ENABLED": "1"},
+    tags = [
+        "exclusive",
+        "team:ml",
+        "train_v2",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
+py_test(
+    name = "test_tpu_utils",
+    size = "small",
+    srcs = ["tests/test_tpu_utils.py"],
     env = {"RAY_TRAIN_V2_ENABLED": "1"},
     tags = [
         "exclusive",

python/ray/train/v2/_internal/callbacks/accelerators.py

Lines changed: 2 additions & 113 deletions
@@ -1,28 +1,19 @@
 import logging
 import os
 from collections import defaultdict
-from typing import List, Optional
+from typing import List
 
-import ray
 import ray._private.ray_constants as ray_constants
 from ray._private.accelerators.nvidia_gpu import CUDA_VISIBLE_DEVICES_ENV_VAR
-from ray._private.accelerators.tpu import (
-    fetch_tpu_slice_name_from_pg,
-    infer_tpu_pod_type_from_topology,
-)
-from ray._private.ray_constants import env_bool, env_integer
+from ray._private.ray_constants import env_bool
 from ray.train import BackendConfig
 from ray.train.constants import (
     ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
-    TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV,
 )
 from ray.train.v2._internal.execution.callback import WorkerGroupCallback
 from ray.train.v2._internal.execution.worker_group import ActorMetadata, WorkerGroup
 from ray.train.v2._internal.util import ray_get_safe
 from ray.train.v2.api.config import ScalingConfig
-from ray.util.placement_group import (
-    PlacementGroup,
-)
 
 logger = logging.getLogger(__name__)
 
@@ -161,105 +152,3 @@ def _get_visible_accelerator_ids_per_worker(
         visible_accelerator_ids_per_worker.append(all_resource_ids)
 
     return visible_accelerator_ids_per_worker
-
-
-def reserve_tpu_slice(
-    num_workers: int,
-    resources_per_worker: dict,
-    topology: Optional[str],
-    accelerator_type: Optional[str],
-) -> Optional[PlacementGroup]:
-    """Creates a SPMD-aware placement group. This currently only supports
-    TPU with JaxTrainer by reserving a multi-host slice.
-
-    This creates a head PG (for index 0) that reserves the `TPU-{}-head` resource
-    on the node, retrieves unique slice information from it, and then creates a
-    multi-host slice PG (for index 0..N-1) that reserves the `TPU` resource on all
-    the nodes in the slice. This enables atomic scheduling of TPU workers.
-
-    Args:
-        num_workers: Total number of workers to launch.
-        resources_per_worker: Resource requirements per bundle (e.g. {"CPU": 4}).
-        topology: The TPU topology string (e.g. "2x2x2").
-        accelerator_type: The accelerator type of the node (e.g. "TPU-V4").
-
-    Returns:
-        A PlacementGroup if able to be created, or None.
-    """
-    if not (topology and accelerator_type):
-        return None
-
-    pod_type = infer_tpu_pod_type_from_topology(topology, accelerator_type)
-    if pod_type is None:
-        return None
-
-    # Reserve a slice by creating a placement group on the
-    # TPU head.
-    head_label_selector = {
-        "ray.io/tpu-worker-id": "0",
-        "ray.io/tpu-pod-type": pod_type,
-    }
-    head_placement_group = ray.util.placement_group(
-        bundles=[{f"TPU-{pod_type}-head": 1}],
-        bundle_label_selector=[head_label_selector],
-    )
-
-    logger.debug("Waiting to reserve multi-host slice head.")
-    timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100)
-    ready, _ = ray.wait([head_placement_group.ready()], timeout=timeout)
-
-    if not ready:
-        raise TimeoutError(
-            "Failed to reserve TPU head for slice with shape: {}. "
-            "Ensure your cluster has sufficient resources. Requesting TPU "
-            "head node with labels: {}. Current resources: {}".format(
-                pod_type, head_label_selector, ray.available_resources()
-            )
-        )
-
-    if num_workers == 1:
-        logger.debug("Reserved single-host TPU placement group.")
-        return head_placement_group
-
-    # Retrieve the unique slice ID.
-    slice_name = fetch_tpu_slice_name_from_pg(head_placement_group)
-    if slice_name is None:
-        raise RuntimeError(
-            "Failed to retrieve TPU slice name after reserving head placement group. "
-            "Ensure that TPU slice metadata is available and correctly configured on multi-host nodes."
-        )
-    slice_label_selector = {
-        "ray.io/tpu-slice-name": slice_name,
-        "ray.io/tpu-pod-type": pod_type,
-    }
-
-    # Schedule the remaining multi-host workers together with the head bundle.
-    slice_placement_group = ray.util.placement_group(
-        bundles=[resources_per_worker] * num_workers,
-        bundle_label_selector=[slice_label_selector] * num_workers,
-        strategy="SPREAD",
-    )
-    logger.debug("Waiting for multi-host slice placement group to start.")
-    timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100)
-    ready, _ = ray.wait([slice_placement_group.ready()], timeout=timeout)
-
-    if ready:
-        logger.debug("SPMD placement groups have started.")
-    else:
-        ray.util.remove_placement_group(head_placement_group)
-        ray.util.remove_placement_group(slice_placement_group)
-        raise TimeoutError(
-            "SPMD Placement group creation timed out. Make sure your "
-            "cluster either has enough resources or use an "
-            "autoscaling cluster. Ensure your cluster has multi-host nodes "
-            "available for SPMD scheduling. "
-            "Current resources available: {}, resources requested by the "
-            "placement groups: {} with labels {}".format(
-                ray.available_resources(),
-                [resources_per_worker] * num_workers,
-                slice_label_selector,
-            )
-        )
-    ray.util.remove_placement_group(head_placement_group)
-
-    return slice_placement_group
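
The reserve_tpu_slice flow removed above is superseded by a helper in ray.train.v2.jax.tpu_utils, which the new TPUReservationCallback below imports. Judging from that call site, the relocated helper takes only the topology and accelerator type and hands back a slice name rather than a placement group. A hedged usage sketch based on that inferred signature (the topology and accelerator type values are placeholders, and the call only succeeds on a cluster with a matching multi-host TPU slice):

import ray
from ray.train.v2.jax.tpu_utils import reserve_tpu_slice

ray.init()

# Placeholder values; in Ray Train these normally come from the ScalingConfig.
slice_name = reserve_tpu_slice(topology="2x2x2", accelerator_type="TPU-V4")
if not slice_name:
    raise RuntimeError("Failed to reserve TPU slice.")

# The slice name becomes a per-bundle label selector for the worker group.
bundle_label_selector = {ray._raylet.RAY_NODE_TPU_SLICE_NAME_KEY: slice_name}
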
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+from typing import Dict, Optional
+
+import ray
+from ray.train.v2._internal.execution.callback import ControllerCallback
+from ray.train.v2.api.config import ScalingConfig
+from ray.train.v2.jax.tpu_utils import reserve_tpu_slice
+
+
+class TPUReservationCallback(ControllerCallback):
+    """A callback to handle TPU slice reservation for multi-host training."""
+
+    def on_controller_start_worker_group(
+        self, *, scaling_config: ScalingConfig, num_workers: int
+    ) -> Optional[Dict[str, str]]:
+        """Reserves a multi-host TPU slice before the worker group starts.
+
+        This hook is called by the TrainController. It checks if multi-host
+        TPUs are being used and, if so, reserves a slice.
+
+        Args:
+            scaling_config: The scaling configuration for the run.
+            num_workers: The number of workers to be started.
+
+        Returns:
+            A dictionary defining a `bundle_label_selector` to gang schedule
+            the worker group on the reserved TPU slice.
+        """
+        bundle_label_selector = None
+
+        if getattr(scaling_config, "use_tpu", False) and num_workers > 1:
+            slice_name = reserve_tpu_slice(
+                topology=getattr(scaling_config, "topology", None),
+                accelerator_type=getattr(scaling_config, "accelerator_type", None),
+            )
+            if not slice_name:
+                raise RuntimeError("Failed to reserve TPU slice.")
+
+            bundle_label_selector = {
+                ray._raylet.RAY_NODE_TPU_SLICE_NAME_KEY: slice_name
+            }
+
+        return bundle_label_selector
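
Because the callback only probes the scaling config with getattr, its contract can be exercised in isolation with a stand-in object. A minimal sketch (the use_tpu, topology, and accelerator_type fields mirror what the getattr calls above expect; reserve_tpu_slice still needs a cluster with a matching multi-host TPU slice to return a name):

from types import SimpleNamespace

# Stand-in for a ScalingConfig carrying the multi-host TPU fields probed above.
scaling_config = SimpleNamespace(
    use_tpu=True, topology="2x2x2", accelerator_type="TPU-V4"
)

callback = TPUReservationCallback()
selector = callback.on_controller_start_worker_group(
    scaling_config=scaling_config, num_workers=4
)
# `selector` is None when no multi-host TPU is requested, otherwise a one-entry
# dict mapping the slice-name label key to the reserved slice.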

python/ray/train/v2/_internal/execution/callback.py

Lines changed: 19 additions & 0 deletions
@@ -2,6 +2,7 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 from ray.train.v2.api.callback import RayTrainCallback
+from ray.train.v2.api.config import ScalingConfig
 from ray.train.v2.api.result import Result
 from ray.util.annotations import DeveloperAPI
 
@@ -78,6 +79,24 @@ def after_controller_start(self, train_run_context: "TrainRunContext"):
         before the control loop starts executing."""
         pass
 
+    def on_controller_start_worker_group(
+        self, *, scaling_config: ScalingConfig, num_workers: int
+    ) -> Optional[Dict[str, str]]:
+        """Called by the TrainController before the worker group is started.
+
+        This hook can be used to perform setup that modifies the worker group's
+        placement, such as reserving an accelerator slice.
+
+        Args:
+            scaling_config: The scaling configuration for the run.
+            num_workers: The number of workers to be started.
+
+        Returns:
+            An optional dictionary defining a `bundle_label_selector`
+            to gang schedule the worker group on the reserved TPU slice.
+        """
+        return None
+
     def before_controller_shutdown(self):
         """Called before `TrainController.run` exits,
         after the control loop has exited."""
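
The base implementation returns None, so existing callbacks are unaffected; a callback opts in to steering placement by returning a selector. A hedged sketch of a custom ControllerCallback (the label key is purely illustrative):

from typing import Dict, Optional

from ray.train.v2._internal.execution.callback import ControllerCallback
from ray.train.v2.api.config import ScalingConfig


class PinToLabeledNodesCallback(ControllerCallback):
    """Illustrative only: ask that all worker bundles land on nodes with a label."""

    def on_controller_start_worker_group(
        self, *, scaling_config: ScalingConfig, num_workers: int
    ) -> Optional[Dict[str, str]]:
        # The controller applies the returned selector to every bundle in the
        # worker group's placement group.
        return {"example.com/pool": "reserved"}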

python/ray/train/v2/_internal/execution/controller/controller.py

Lines changed: 16 additions & 17 deletions
@@ -9,9 +9,6 @@
 
 import ray
 import ray._private.ray_constants as ray_constants
-from ray.train.v2._internal.callbacks.accelerators import (
-    reserve_tpu_slice,
-)
 from ray.train.v2._internal.constants import (
     DEFAULT_ENABLE_CONTROLLER_LOGGING,
     DEFAULT_HEALTH_CHECK_INTERVAL_S,
@@ -283,27 +280,29 @@ def _start_worker_group(
             ControllerError if the worker group failed to start.
         """
         placement_strategy = self._scaling_policy.scaling_config.placement_strategy
-        placement_group = None
-        backend_config = self._train_run_context.backend_config
-
-        if getattr(backend_config, "use_tpu", False):
-            try:
-                placement_group = reserve_tpu_slice(
-                    num_workers=num_workers,
-                    resources_per_worker=resources_per_worker,
-                    topology=getattr(backend_config, "topology", None),
-                    accelerator_type=getattr(backend_config, "accelerator_type", None),
-                )
-            except Exception as e:
-                return ControllerError(e)
+        scaling_config = self._train_run_context.scaling_config
+
+        # Check for `bundle_label_selector` to influence WorkerGroup scheduling.
+        bundle_label_selector = None
+        try:
+            for callback in self._callbacks:
+                if hasattr(callback, "on_controller_start_worker_group"):
+                    selector = callback.on_controller_start_worker_group(
+                        scaling_config=scaling_config, num_workers=num_workers
+                    )
+                    if selector:
+                        bundle_label_selector = selector
+                        break
+        except Exception as e:
+            return ControllerError(e)
 
         worker_group_context = WorkerGroupContext(
             run_attempt_id=self._get_run_attempt_id(),
             train_fn_ref=self._train_fn_ref,
             num_workers=num_workers,
             resources_per_worker=resources_per_worker,
             placement_strategy=placement_strategy,
-            placement_group=placement_group,
+            bundle_label_selector=bundle_label_selector,
         )
         try:
             self._worker_group = self.worker_group_cls.create(
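
The controller no longer builds a TPU placement group itself; it polls its callbacks, and the first non-empty selector wins. The selection rule, pulled out as a standalone sketch:

from typing import Dict, List, Optional


def pick_bundle_label_selector(
    callbacks: List[object], scaling_config, num_workers: int
) -> Optional[Dict[str, str]]:
    # Mirrors the controller loop above: the first callback that returns a
    # non-empty selector determines the worker group's bundle labels.
    for callback in callbacks:
        if hasattr(callback, "on_controller_start_worker_group"):
            selector = callback.on_controller_start_worker_group(
                scaling_config=scaling_config, num_workers=num_workers
            )
            if selector:
                return selector
    return None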

python/ray/train/v2/_internal/execution/worker_group/worker_group.py

Lines changed: 15 additions & 9 deletions
@@ -89,15 +89,15 @@ class WorkerGroupContext:
         num_workers: The number of workers in the worker group.
         resources_per_worker: The resources per worker.
         placement_strategy: Strategy for placing workers.
-        placement_group: Optional override placement group to schedule workers to.
+        bundle_label_selector: Optional label selectors to apply per-bundle for workers.
     """
 
     run_attempt_id: str
     train_fn_ref: ObjectRefWrapper[Callable[[], None]]
     num_workers: int
     resources_per_worker: Dict[str, float]
     placement_strategy: str = "PACK"
-    placement_group: Optional[PlacementGroup] = None
+    bundle_label_selector: Optional[Dict[str, str]] = None
 
 
 class WorkerGroup:
@@ -255,7 +255,6 @@ def _start_impl(
         """
         self._assert_inactive()
         worker_group_context = self._worker_group_context
-        pg = worker_group_context.placement_group
 
         WorkerGroup._check_cluster_resources_and_raise_if_insufficient(
             worker_group_context.resources_per_worker,
@@ -271,12 +270,19 @@
         for callback in self._callbacks:
             callback.before_worker_group_start(worker_group_context)
 
-        if pg is None:
-            pg = placement_group(
-                bundles=[worker_group_context.resources_per_worker]
-                * worker_group_context.num_workers,
-                strategy=worker_group_context.placement_strategy,
-            )
+        bundle_label_selector = (
+            [worker_group_context.bundle_label_selector.copy()]
+            * worker_group_context.num_workers
+            if worker_group_context.bundle_label_selector
+            else None
+        )
+
+        pg = placement_group(
+            bundles=[worker_group_context.resources_per_worker]
+            * worker_group_context.num_workers,
+            strategy=worker_group_context.placement_strategy,
+            bundle_label_selector=bundle_label_selector,
+        )
         logger.info(
             f"Attempting to start training worker group of size {worker_group_context.num_workers} with "
             f"the following resources: [{worker_group_context.resources_per_worker}] * {worker_group_context.num_workers}"
