Merged
46 changes: 26 additions & 20 deletions python/ray/data/_internal/execution/operators/hash_shuffle.py
@@ -3,6 +3,7 @@
 import itertools
 import logging
 import math
+import random
 import threading
 import time
 from collections import defaultdict, deque
@@ -16,6 +17,7 @@
     Dict,
     List,
     Optional,
+    Set,
     Tuple,
     Union,
 )
@@ -601,8 +603,10 @@ def __init__(
         # aggregators (keeps track which input sequences have already broadcasted
         # their schemas)
         self._has_schemas_broadcasted: DefaultDict[int, bool] = defaultdict(bool)
-        # Id of the last partition finalization of which had already been scheduled
-        self._last_finalized_partition_id: int = -1
+        # Set of partitions still pending finalization
+        self._pending_finalization_partition_ids: Set[int] = set(
+            range(target_num_partitions)
+        )

         self._output_queue: Deque[RefBundle] = deque()

@@ -823,11 +827,6 @@ def _try_finalize(self):
         if not self._is_shuffling_done():
             return

-        logger.debug(
-            f"Scheduling next shuffling finalization batch (last finalized "
-            f"partition id is {self._last_finalized_partition_id})"
-        )
-
         def _on_bundle_ready(partition_id: int, bundle: RefBundle):
             # Add finalized block to the output queue
             self._output_queue.append(bundle)
@@ -872,10 +871,8 @@ def _on_aggregation_done(partition_id: int, exc: Optional[Exception]):
             or self._aggregator_pool.num_aggregators
         )

-        num_remaining_partitions = (
-            self._num_partitions - 1 - self._last_finalized_partition_id
-        )
         num_running_finalizing_tasks = len(self._finalizing_tasks)
+        num_remaining_partitions = len(self._pending_finalization_partition_ids)

         # Finalization is executed in batches of no more than
         # `DataContext.max_hash_shuffle_finalization_batch_size` tasks at a time.
@@ -899,12 +896,21 @@ def _on_aggregation_done(partition_id: int, exc: Optional[Exception]):
         if next_batch_size == 0:
             return

-        # Next partition to be scheduled for finalization is the one right
-        # after the last one scheduled
-        next_partition_id = self._last_finalized_partition_id + 1
-
-        target_partition_ids = list(
-            range(next_partition_id, next_partition_id + next_batch_size)
+        # We sample the next set of partitions to finalize randomly, to spread
+        # the finalization window uniformly across the nodes of the cluster and
+        # avoid a "sliding lens" effect where we'd finalize a batch of N *adjacent*
+        # partitions that may be co-located on the same node:
+        #
+        # - Adjacent partitions i and i+1 are handled by adjacent
@iamjustinhsu (Contributor) commented on Nov 7, 2025:

Wait, is this true? If modulo N = num actors, then partitions i and i + 1 must necessarily be in different actors. Oh wait, never mind, I see what you're saying.
+        #   aggregators (since membership is determined as i % num_aggregators)
+        #
+        # - Adjacent aggregators have a high likelihood of running on the
+        #   same node (when num aggregators > num nodes)
Comment on lines +907 to +908

@iamjustinhsu (Contributor) commented on Nov 7, 2025:

Is this necessarily true? Your default strategy is SPREAD, and each aggregator is scheduled with the same amount of resources, so aggregators i and i + 1 have as much of a chance of being scheduled on the same node as aggregators i and j. Please correct my assumptions if I'm wrong.
+        #
+        # NOTE: This doesn't affect determinism, since it only impacts the order
+        # of finalization (hence the sampling doesn't need to be seeded)
+        target_partition_ids = random.sample(
A Contributor commented:

So wouldn't a better strategy be to check how much each aggregator actor is currently consuming relative to the node's capacity, and schedule the finalization if there's remaining capacity? I just find the randomization strategy harder to reason about in this case.

A Contributor commented:

Also, it's a function of partition size, so ideally, if we can get metadata about the partition before scheduling the finalize(), that would be even better.
+            list(self._pending_finalization_partition_ids), next_batch_size
         )
Comment on lines +912 to +914

A Contributor commented (severity: medium):

The random.sample function can operate directly on sets, so converting self._pending_finalization_partition_ids to a list is unnecessary. Removing the list() conversion would avoid creating a new list on each call, which can be expensive when the number of pending partitions is large.

Suggested change:

-        target_partition_ids = random.sample(
-            list(self._pending_finalization_partition_ids), next_batch_size
-        )
+        target_partition_ids = random.sample(
+            self._pending_finalization_partition_ids, next_batch_size
+        )


         logger.debug(
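A note on the suggested change above: random.sample accepts a set only through a deprecated code path in Python 3.9–3.10 and rejects it with a TypeError from Python 3.11 onward, so dropping the list() conversion would break on newer interpreters. A minimal check (illustrative, not part of the PR):

```python
import random
import sys

pending_partition_ids = {0, 1, 2, 3}

# Through Python 3.10, sampling from a set only emits a DeprecationWarning;
# from 3.11 on it raises TypeError, so the explicit list() conversion in the
# diff is load-bearing rather than a style choice.
try:
    batch = random.sample(pending_partition_ids, 2)
except TypeError:
    print(f"Python {sys.version_info.major}.{sys.version_info.minor}: sets are rejected")
    batch = random.sample(list(pending_partition_ids), 2)

print(batch)
```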
@@ -941,15 +947,15 @@ def _on_aggregation_done(partition_id: int, exc: Optional[Exception]):
             ),
         )

+        # Pop partition id from remaining set
+        self._pending_finalization_partition_ids.remove(partition_id)
+
         # Update Finalize Metrics on task submission
         # NOTE: This is empty because the input is directly forwarded from the
         # output of the shuffling stage, which we don't return.
         empty_bundle = RefBundle([], schema=None, owns_blocks=False)
         self.reduce_metrics.on_task_submitted(partition_id, empty_bundle)

-        # Update last finalized partition id
-        self._last_finalized_partition_id = max(target_partition_ids)
-
     def _do_shutdown(self, force: bool = False) -> None:
         self._aggregator_pool.shutdown(force=True)
         # NOTE: It's critical for Actor Pool to release actors before calling into
@@ -1021,7 +1027,7 @@ def implements_accurate_memory_accounting(self) -> bool:
         return True

     def _is_finalized(self):
-        return self._last_finalized_partition_id == self._num_partitions - 1
+        return len(self._pending_finalization_partition_ids) == 0

     def _handle_shuffled_block_metadata(
         self,
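To make the new finalization flow concrete, here is a minimal, self-contained sketch of the pending-set pattern this diff introduces: pending partitions live in a set, each scheduling round samples a random batch bounded by the in-flight task budget, sampled ids are removed when their tasks are submitted, and the operator counts as finalized once the set is empty. This is not Ray's implementation; the class name, method signatures, and batch-budget arithmetic are illustrative assumptions.

```python
import random
from typing import List, Set


class FinalizationScheduler:
    """Illustrative stand-in for the operator's finalization bookkeeping."""

    def __init__(self, num_partitions: int, max_batch_size: int):
        self._max_batch_size = max_batch_size
        # Every partition starts out pending finalization.
        self._pending: Set[int] = set(range(num_partitions))

    def next_batch(self, num_running_finalizing_tasks: int) -> List[int]:
        # Cap the number of concurrently running finalization tasks.
        budget = max(self._max_batch_size - num_running_finalizing_tasks, 0)
        batch_size = min(budget, len(self._pending))
        if batch_size == 0:
            return []
        # Sample uniformly so a batch doesn't land on N adjacent partitions whose
        # aggregators (chosen as partition_id % num_aggregators) may be co-located
        # on the same node. random.sample needs a sequence, hence list().
        batch = random.sample(list(self._pending), batch_size)
        for partition_id in batch:
            self._pending.remove(partition_id)
        return batch

    def is_finalized(self) -> bool:
        return not self._pending


scheduler = FinalizationScheduler(num_partitions=8, max_batch_size=3)
while not scheduler.is_finalized():
    print("finalizing:", scheduler.next_batch(num_running_finalizing_tasks=0))
```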
14 changes: 5 additions & 9 deletions python/ray/data/_internal/execution/operators/join.py
@@ -166,6 +166,7 @@ def _preprocess(
         left_seq_partition: pa.Table = self._get_partition_builder(
             input_seq_id=0, partition_id=partition_id
         ).build()
+
         right_seq_partition: pa.Table = self._get_partition_builder(
             input_seq_id=1, partition_id=partition_id
         ).build()
@@ -198,7 +199,6 @@ def _preprocess(
         should_index_r = self._should_index_side("right", supported_r, unsupported_r)

         # Add index columns for back-referencing if we have unsupported columns
-        # TODO: what are the chances of a collision with the index column?
         if should_index_l:
             supported_l = self._append_index_column(
                 table=supported_l, col_name=self._index_name("left")
@@ -246,7 +246,7 @@ def _postprocess(
         return supported

     def _index_name(self, suffix: str) -> str:
-        return f"__ray_data_index_level_{suffix}__"
+        return f"__rd_index_level_{suffix}__"

     def clear(self, partition_id: int):
         self._left_input_seq_partition_builders.pop(partition_id)
@@ -263,9 +263,6 @@ def _get_partition_builder(self, *, input_seq_id: int, partition_id: int):
         )
         return partition_builder

-    def _get_index_col_name(self, index: int) -> str:
-        return f"__index_level_{index}__"
-
     def _should_index_side(
         self, side: str, supported_table: "pa.Table", unsupported_table: "pa.Table"
     ) -> bool:
@@ -318,9 +315,8 @@ def _split_unsupported_columns(
"""
supported, unsupported = [], []
for idx in range(len(table.columns)):
column: "pa.ChunkedArray" = table.column(idx)

col_type = column.type
col: "pa.ChunkedArray" = table.column(idx)
col_type: "pa.DataType" = col.type

if _is_pa_extension_type(col_type) or self._is_pa_join_not_supported(
col_type
@@ -329,7 +325,7 @@ def _split_unsupported_columns(
             else:
                 supported.append(idx)

-        return (table.select(supported), table.select(unsupported))
+        return table.select(supported), table.select(unsupported)

     def _add_back_unsupported_columns(
         self,
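The join.py changes touch the supported/unsupported column split and the back-reference index column. To illustrate that flow end to end, here is a minimal standalone sketch; it is not Ray's code, and the helper names, sample tables, and the choice to treat nested types as join-unsupported are assumptions for the demonstration (it also assumes pyarrow >= 7.0 for Table.join):

```python
import pyarrow as pa


def split_unsupported_columns(table: pa.Table):
    """Split a table into (supported, unsupported) column subsets, treating
    Arrow extension and nested types as unsupported by the native join."""
    supported, unsupported = [], []
    for idx in range(table.num_columns):
        col_type = table.column(idx).type
        if isinstance(col_type, pa.ExtensionType) or pa.types.is_nested(col_type):
            unsupported.append(idx)
        else:
            supported.append(idx)
    return table.select(supported), table.select(unsupported)


def append_index_column(table: pa.Table, col_name: str) -> pa.Table:
    # Row index used to back-reference unsupported columns after the join.
    index = pa.array(list(range(table.num_rows)), type=pa.int64())
    return table.append_column(col_name, index)


left = pa.table({"k": [1, 2, 3], "v": ["a", "b", "c"], "tags": [[1], [2, 3], []]})
right = pa.table({"k": [2, 3, 4], "w": [20.0, 30.0, 40.0]})

supported, unsupported = split_unsupported_columns(left)
indexed = append_index_column(supported, "__rd_index_level_left__")
joined = indexed.join(right, keys="k", join_type="inner")

# Re-attach the unsupported column by gathering its rows via the preserved index.
restored = joined.append_column(
    "tags", unsupported.column("tags").take(joined.column("__rd_index_level_left__"))
)
print(restored.column_names)
```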