[Data] Sample finalized partitions randomly to avoid lensing finalization on a single node #58456
Changes from all commits
@@ -3,6 +3,7 @@
 import itertools
 import logging
 import math
+import random
 import threading
 import time
 from collections import defaultdict, deque

@@ -16,6 +17,7 @@
     Dict,
     List,
     Optional,
+    Set,
     Tuple,
     Union,
 )

@@ -601,8 +603,10 @@ def __init__(
         # aggregators (keeps track which input sequences have already broadcasted
         # their schemas)
         self._has_schemas_broadcasted: DefaultDict[int, bool] = defaultdict(bool)
-        # Id of the last partition finalization of which had already been scheduled
-        self._last_finalized_partition_id: int = -1
+        # Set of partitions still pending finalization
+        self._pending_finalization_partition_ids: Set[int] = set(
+            range(target_num_partitions)
+        )

         self._output_queue: Deque[RefBundle] = deque()

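Why switch from a counter to a set: `_last_finalized_partition_id` could only describe a contiguous prefix of scheduled partitions, while `_pending_finalization_partition_ids` allows scheduling them in any order, which the random sampling later in this diff relies on. A minimal sketch of the idea (values are made up, not the operator's code):

```python
# Illustrative only: a single "last finalized id" can only describe a
# contiguous prefix 0..k of scheduled partitions, whereas a set of pending
# ids supports scheduling partitions in arbitrary order.
pending = set(range(6))       # pretend target_num_partitions == 6
for partition_id in (4, 1):   # partitions scheduled out of order
    pending.remove(partition_id)
print(sorted(pending))        # [0, 2, 3, 5] -> still pending
```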
@@ -823,11 +827,6 @@ def _try_finalize(self):
         if not self._is_shuffling_done():
             return

-        logger.debug(
-            f"Scheduling next shuffling finalization batch (last finalized "
-            f"partition id is {self._last_finalized_partition_id})"
-        )
-
         def _on_bundle_ready(partition_id: int, bundle: RefBundle):
             # Add finalized block to the output queue
             self._output_queue.append(bundle)

@@ -872,10 +871,8 @@ def _on_aggregation_done(partition_id: int, exc: Optional[Exception]):
             or self._aggregator_pool.num_aggregators
         )

-        num_remaining_partitions = (
-            self._num_partitions - 1 - self._last_finalized_partition_id
-        )
         num_running_finalizing_tasks = len(self._finalizing_tasks)
+        num_remaining_partitions = len(self._pending_finalization_partition_ids)

         # Finalization is executed in batches of no more than
         # `DataContext.max_hash_shuffle_finalization_batch_size` tasks at a time.

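The batch-size computation itself is not shown in this excerpt. As a hedged sketch of how the cap described in the comment could combine these counters (the formula below is an assumption, not the code under review):

```python
# Illustrative only: the actual batch-size formula is not part of this excerpt.
# Assumption: the batch is capped by the configured max, reduced by tasks already
# in flight, and never exceeds the number of partitions left to finalize.
def next_finalization_batch_size(
    max_batch_size: int,                # stand-in for max_hash_shuffle_finalization_batch_size
    num_running_finalizing_tasks: int,
    num_remaining_partitions: int,
) -> int:
    free_slots = max(0, max_batch_size - num_running_finalizing_tasks)
    return min(free_slots, num_remaining_partitions)


print(next_finalization_batch_size(8, 5, 100))  # 3
print(next_finalization_batch_size(8, 8, 100))  # 0 -> nothing scheduled this round
```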
@@ -899,12 +896,21 @@ def _on_aggregation_done(partition_id: int, exc: Optional[Exception]):
         if next_batch_size == 0:
             return

-        # Next partition to be scheduled for finalization is the one right
-        # after the last one scheduled
-        next_partition_id = self._last_finalized_partition_id + 1
-
-        target_partition_ids = list(
-            range(next_partition_id, next_partition_id + next_batch_size)
+        # We sample the next set of partitions to finalize randomly in order to
+        # distribute the finalization window uniformly across the nodes of the
+        # cluster and avoid a "sliding lens" effect where we finalize a batch of
+        # N *adjacent* partitions that may be co-located on the same node:
+        #
+        #   - Adjacent partitions i and i+1 are handled by adjacent
+        #     aggregators (since membership is determined as i % num_aggregators)
+        #
+        #   - Adjacent aggregators have a high likelihood of running on the
+        #     same node (when num aggregators > num nodes)
Comment on lines +907 to +908 (Contributor):
Is this necessarily true? Your default strategy is SPREAD, and each aggregator is scheduled with the same amount of resources, so aggregators i and i + 1 have as much of a chance of being scheduled on the same node as aggregators i and j. Please correct my assumptions if I'm wrong.
+        #
+        # NOTE: This doesn't affect determinism, since it only impacts the order
+        #       of finalization (hence no seeding is required)
+        target_partition_ids = random.sample(
Contributor:
So wouldn't a better strategy be to check how much each aggregator actor is currently consuming relative to the node's capacity, and schedule the finalization only if there's remaining capacity? I just find the randomization strategy harder to reason about in this case.

Contributor:
Also, it's a function of partition size, so ideally, if we can get metadata about the partition before scheduling the
+            list(self._pending_finalization_partition_ids), next_batch_size
         )
Comment on lines +912 to 914 (Contributor): suggested a change here.

         logger.debug(

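To make the "sliding lens" concern concrete, here is a toy model, not the actual Ray scheduler: it assumes adjacent aggregators are packed onto the same node, which is exactly the placement assumption the reviewer questions above (under a SPREAD strategy the sequential batch may already be well distributed):

```python
# Toy model of the "sliding lens" concern. The partition -> aggregator mapping
# follows the diff comment (i % num_aggregators); the aggregator -> node mapping
# below ASSUMES packed placement, which is the disputed assumption.
import random
from collections import Counter

num_partitions, num_aggregators, num_nodes, batch_size = 64, 8, 2, 4

def node_of_partition(partition_id: int) -> int:
    aggregator_id = partition_id % num_aggregators         # partition -> aggregator
    return aggregator_id * num_nodes // num_aggregators    # aggregator -> node (packed, assumed)

# Sequential selection finalizes N adjacent partitions; under packed placement
# they all land on the same node:
sequential = list(range(batch_size))
print(Counter(node_of_partition(p) for p in sequential))   # Counter({0: 4})

# Random sampling spreads the same-sized batch across nodes in expectation:
randomized = random.sample(range(num_partitions), batch_size)
print(Counter(node_of_partition(p) for p in randomized))   # varies, e.g. Counter({0: 2, 1: 2})
```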
@@ -941,15 +947,15 @@ def _on_aggregation_done(partition_id: int, exc: Optional[Exception]):
             ),
         )

+        # Pop partition id from the remaining set
+        self._pending_finalization_partition_ids.remove(partition_id)
+
         # Update Finalize Metrics on task submission
         # NOTE: This is empty because the input is directly forwarded from the
         #       output of the shuffling stage, which we don't return.
         empty_bundle = RefBundle([], schema=None, owns_blocks=False)
         self.reduce_metrics.on_task_submitted(partition_id, empty_bundle)

-        # Update last finalized partition id
-        self._last_finalized_partition_id = max(target_partition_ids)
-
     def _do_shutdown(self, force: bool = False) -> None:
         self._aggregator_pool.shutdown(force=True)
         # NOTE: It's critical for Actor Pool to release actors before calling into

@@ -1021,7 +1027,7 @@ def implements_accurate_memory_accounting(self) -> bool:
         return True

     def _is_finalized(self):
-        return self._last_finalized_partition_id == self._num_partitions - 1
+        return len(self._pending_finalization_partition_ids) == 0

     def _handle_shuffled_block_metadata(
         self,
Comment:
Wait, is this true? If modulo N = num actors, then partitions i and i + 1 must necessarily be in different actors. Oh wait, nvm, I see what you're saying.