ray-project · ericl · Feb 8, 2023 · Feb 8, 2023 · Feb 8, 2023 · Feb 8, 2023
@@ -150,9 +150,8 @@ class ExecutionOptions:
     # node (node driving the execution).
     locality_with_output: bool = False
 
-    # Always preserve ordering of blocks, even if using operators that
-    # don't require it.
-    preserve_order: bool = True
+    # Set this to preserve the ordering between blocks processed by operators.
+    preserve_order: bool = False
 
 
 @dataclass

@@ -3633,10 +3633,10 @@ def repeat(self, times: Optional[int] = None) -> "DatasetPipeline[T]":
         Examples:
             >>> import ray
             >>> # Infinite pipeline of numbers [0, 5)
-            >>> ray.data.range(5).repeat().take()
+            >>> ray.data.range(5, parallelism=1).repeat().take()
             [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, ...]
             >>> # Can apply transformations to the pipeline.
-            >>> ray.data.range(5).repeat().map(lambda x: -x).take()
+            >>> ray.data.range(5, parallelism=1).repeat().map(lambda x: -x).take()
             [0, -1, -2, -3, -4, 0, -1, -2, -3, -4, ...]
             >>> # Can shuffle each epoch (dataset) in the pipeline.
             >>> ray.data.range(5).repeat().random_shuffle().take() # doctest: +SKIP

@@ -152,7 +152,7 @@ def aggregate(self, *aggs: AggregateFn) -> Dataset[U]:
                     init=lambda k: [],
                     accumulate_row=lambda a, r: a + [r],
                     merge=lambda a1, a2: a1 + a2,
-                    finalize=lambda a: a
+                    finalize=lambda a: sorted(a)
                 ))
                 result.show()
 

@@ -2381,7 +2381,10 @@ def test_select_columns(ray_start_regular_shared):
         ds3.select_columns(cols=[]).fully_executed()
 
 
-def test_map_batches_basic(ray_start_regular_shared, tmp_path):
+def test_map_batches_basic(ray_start_regular_shared, tmp_path, restore_dataset_context):
+    ctx = DatasetContext.get_current()
+    ctx.execution_options.preserve_order = True
+
     # Test input validation
     ds = ray.data.range(5)
     with pytest.raises(ValueError):
@@ -2710,8 +2713,11 @@ def test_map_batches_actors_preserves_order(ray_start_regular_shared):
     ],
 )
 def test_map_batches_batch_mutation(
-    ray_start_regular_shared, num_rows, num_blocks, batch_size
+    ray_start_regular_shared, num_rows, num_blocks, batch_size, restore_dataset_context
 ):
+    ctx = DatasetContext.get_current()
+    ctx.execution_options.preserve_order = True
+
     # Test that batch mutation works without encountering a read-only error (e.g. if the
     # batch is a zero-copy view on data in the object store).
     def mutate(df):
@@ -4832,7 +4838,9 @@ def test_random_block_order_schema(ray_start_regular_shared):
     ds.schema().names == ["a", "b"]
 
 
-def test_random_block_order(ray_start_regular_shared):
+def test_random_block_order(ray_start_regular_shared, restore_dataset_context):
+    ctx = DatasetContext.get_current()
+    ctx.execution_options.preserve_order = True
 
     # Test BlockList.randomize_block_order.
     ds = ray.data.range(12).repartition(4)

@@ -38,7 +38,7 @@ def ref_bundles_to_list(bundles: List[RefBundle]) -> List[List[Any]]:
 
 
 def test_pipelined_execution(ray_start_10_cpus_shared):
-    executor = StreamingExecutor(ExecutionOptions())
+    executor = StreamingExecutor(ExecutionOptions(preserve_order=True))
     inputs = make_ref_bundles([[x] for x in range(20)])
     o1 = InputDataBuffer(inputs)
     o2 = MapOperator.create(make_transform(lambda block: [b * -1 for b in block]), o1)

@@ -14,6 +14,7 @@
 from ray.air import session
 from ray.air.checkpoint import Checkpoint
 from ray.air.constants import MAX_REPR_LENGTH
+from ray.data.context import DatasetContext
 from ray.data.preprocessor import Preprocessor
 from ray.data.preprocessors import BatchMapper
 from ray.tune.impl import tuner_internal
@@ -97,6 +98,9 @@ def training_loop(self):
 
 
 def test_preprocess_datasets(ray_start_4_cpus):
+    ctx = DatasetContext.get_current()
+    ctx.execution_options.preserve_order = True
+
     def training_loop(self):
         assert self.datasets["my_dataset"].take() == [2, 3, 4]