Merge branch 'master' into rushikesh/remove-read-parquet-bulk-api

rushikeshadhav · web-flow · commit bbfae94d1ecf · 2025-11-25T17:37:01.000+05:30
diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst
@@ -173,9 +173,9 @@ Asynchronous Proximal Policy Optimization (APPO)
 
 .. tip::
 
-    APPO was originally `published under the name "IMPACT" <https://arxiv.org/abs/1707.06347>`__. RLlib's APPO exactly matches the algorithm described in the paper.
+    APPO was originally `published under the name "IMPACT" <https://arxiv.org/abs/1912.00167>`__. RLlib's APPO exactly matches the algorithm described in the paper.
 
-`[paper] <https://arxiv.org/abs/1707.06347>`__
+`[paper] <https://arxiv.org/abs/1912.00167>`__
 `[implementation] <https://github.com/ray-project/ray/blob/master/rllib/algorithms/appo/appo.py>`__
 
 .. figure:: images/algos/appo-architecture.svg
diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py
@@ -343,7 +343,7 @@ def __init__(
         self._in_task_output_backpressure = False
         self._estimated_num_output_bundles = None
         self._estimated_output_num_rows = None
-        self._execution_finished = False
+        self._is_execution_marked_finished = False
         # The LogicalOperator(s) which were translated to create this PhysicalOperator.
         # Set via `PhysicalOperator.set_logical_operators()`.
         self._logical_operators: List[LogicalOperator] = []
@@ -401,48 +401,51 @@ def override_target_max_block_size(self, target_max_block_size: Optional[int]):
 
     def mark_execution_finished(self):
         """Manually mark that this operator has finished execution."""
-        self._execution_finished = True
+        self._is_execution_marked_finished = True
 
-    def execution_finished(self) -> bool:
+    def has_execution_finished(self) -> bool:
         """Return True when this operator has finished execution.
 
         The outputs may or may not have been taken.
         """
-        return self._execution_finished
+        from ..operators.base_physical_operator import InternalQueueOperatorMixin
+
+        internal_input_queue_num_blocks = 0
+        if isinstance(self, InternalQueueOperatorMixin):
+            internal_input_queue_num_blocks = self.internal_input_queue_num_blocks()
+
+        # NOTE: Execution is considered finished if
+        #   - The operator was explicitly marked finished OR
+        #   - The following auto-completion conditions are met
+        #       - All input blocks have been ingested
+        #       - Internal input queue is empty
+        #       - There are no active or pending tasks
+
+        return self._is_execution_marked_finished or (
+            self._inputs_complete
+            and self.num_active_tasks() == 0
+            and internal_input_queue_num_blocks == 0
+        )
 
     def completed(self) -> bool:
         """Returns whether this operator has been fully completed.
 
         An operator is completed iff:
-            * The operator has finished execution (i.e., `execution_finished()` is True).
+            * The operator has finished execution (i.e., `has_execution_finished()` is True).
             * All outputs have been taken (i.e., `has_next()` is False) from it.
         """
         from ..operators.base_physical_operator import InternalQueueOperatorMixin
 
-        internal_input_queue_num_blocks = 0
         internal_output_queue_num_blocks = 0
         if isinstance(self, InternalQueueOperatorMixin):
-            internal_input_queue_num_blocks = self.internal_input_queue_num_blocks()
             internal_output_queue_num_blocks = self.internal_output_queue_num_blocks()
 
-        if not self._execution_finished:
-            if (
-                self._inputs_complete
-                and internal_input_queue_num_blocks == 0
-                and self.num_active_tasks() == 0
-            ):
-                # NOTE: Operator is considered completed iff
-                #   - All input blocks have been ingested
-                #   - Internal queue is empty
-                #   - There are no active or pending tasks
-                self._execution_finished = True
-
         # NOTE: We check for (internal_output_queue_size == 0) and
         # (not self.has_next()) because _OrderedOutputQueue can
         # return False for self.has_next(), but have a non-empty queue size.
         # Draining the internal output queue is important to free object refs.
         return (
-            self._execution_finished
+            self.has_execution_finished()
             and not self.has_next()
             and internal_output_queue_num_blocks == 0
         )
diff --git a/python/ray/data/_internal/execution/operators/limit_operator.py b/python/ray/data/_internal/execution/operators/limit_operator.py
@@ -117,7 +117,7 @@ def get_stats(self) -> StatsDict:
     def num_outputs_total(self) -> Optional[int]:
         # Before execution is completed, we don't know how many output
         # bundles we will have. We estimate based off the consumption so far.
-        if self._execution_finished:
+        if self.has_execution_finished():
             return self._cur_output_bundles
         return self._estimated_num_output_bundles
 
diff --git a/python/ray/data/_internal/execution/resource_manager.py b/python/ray/data/_internal/execution/resource_manager.py
@@ -341,7 +341,7 @@ def is_op_eligible(self, op: PhysicalOperator) -> bool:
             not op.throttling_disabled()
             # As long as the op has finished execution, even if there are still
             # non-taken outputs, we don't need to allocate resources for it.
-            and not op.execution_finished()
+            and not op.has_execution_finished()
         )
 
     def get_eligible_ops(self) -> List[PhysicalOperator]:
@@ -553,7 +553,7 @@ def _is_op_eligible(op: PhysicalOperator) -> bool:
             not op.throttling_disabled()
             # As long as the op has finished execution, even if there are still
             # non-taken outputs, we don't need to allocate resources for it.
-            and not op.execution_finished()
+            and not op.has_execution_finished()
         )
 
     def _get_downstream_eligible_ops(
@@ -674,9 +674,9 @@ def _get_ineligible_ops_with_usage(self) -> List[PhysicalOperator]:
         ops_to_exclude_from_reservation = []
         # Traverse operator tree collecting all operators that have already finished
         for op in self._topology:
-            if not op.execution_finished():
+            if not op.has_execution_finished():
                 for dep in op.input_dependencies:
-                    if dep.execution_finished():
+                    if dep.has_execution_finished():
                         last_completed_ops.append(dep)
 
         # In addition to completed operators,
diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py
@@ -652,7 +652,7 @@ def _get_state_dict(self, state):
                     "total_rows": op.num_output_rows_total(),
                     "queued_blocks": op_state.total_enqueued_input_blocks(),
                     "state": DatasetState.FINISHED.name
-                    if op.execution_finished()
+                    if op.has_execution_finished()
                     else state,
                 }
                 for i, (op, op_state) in enumerate(self._topology.items())
diff --git a/python/ray/data/_internal/execution/streaming_executor_state.py b/python/ray/data/_internal/execution/streaming_executor_state.py
@@ -672,7 +672,7 @@ def update_operator_states(topology: Topology) -> None:
         # Drain external input queue if current operator is execution finished.
         # This is needed when the limit is reached, and `mark_execution_finished`
         # is called manually.
-        if op.execution_finished():
+        if op.has_execution_finished():
             for input_queue in op_state.input_queues:
                 # Drain input queue
                 input_queue.clear()
diff --git a/python/ray/data/tests/test_operators.py b/python/ray/data/tests/test_operators.py
@@ -1055,7 +1055,7 @@ def test_limit_operator(ray_start_regular_shared):
         while input_op.has_next() and not limit_op._limit_reached():
             loop_count += 1
             assert not limit_op.completed(), limit
-            assert not limit_op._execution_finished, limit
+            assert not limit_op.has_execution_finished(), limit
             limit_op.add_input(input_op.get_next(), 0)
             while limit_op.has_next():
                 # Drain the outputs. So the limit operator
@@ -1066,12 +1066,12 @@ def test_limit_operator(ray_start_regular_shared):
                 assert limit_op.mark_execution_finished.call_count == 1, limit
                 assert limit_op.completed(), limit
                 assert limit_op._limit_reached(), limit
-                assert limit_op._execution_finished, limit
+                assert limit_op.has_execution_finished(), limit
             else:
                 assert limit_op.mark_execution_finished.call_count == 0, limit
                 assert not limit_op.completed(), limit
                 assert not limit_op._limit_reached(), limit
-                assert not limit_op._execution_finished, limit
+                assert not limit_op.has_execution_finished(), limit
         limit_op.mark_execution_finished()
         # After inputs done, the number of output bundles
         # should be the same as the number of `add_input`s.
diff --git a/python/ray/data/tests/test_streaming_executor.py b/python/ray/data/tests/test_streaming_executor.py
@@ -258,7 +258,7 @@ def test_update_operator_states_drains_upstream(ray_start_regular_shared):
 
     # Manually mark o2 as execution finished (simulating limit operator behavior)
     o2.mark_execution_finished()
-    assert o2.execution_finished(), "o2 should be execution finished"
+    assert o2.has_execution_finished(), "o2 should be execution finished"
 
     # Call update_operator_states - this should drain o1's output queue
     update_operator_states(topo)
diff --git a/rllib/env/env_runner.py b/rllib/env/env_runner.py
@@ -258,12 +258,10 @@ def _try_env_step(self, actions):
         except Exception as e:
             self.metrics.log_value(NUM_ENV_STEP_FAILURES_LIFETIME, 1, reduce="sum")
 
-            # @OldAPIStack (config.restart_failed_sub_environments)
             if self.config.restart_failed_sub_environments:
                 if not isinstance(e, StepFailedRecreateEnvError):
                     logger.exception(
-                        "Stepping the env resulted in an error! The original error "
-                        f"is: {e}"
+                        f"RLlib {self.__class__.__name__}: Environment step failed. Will force reset env(s) in this EnvRunner. The original error is: {e}"
                     )
                 # Recreate the env.
                 self.make_env()
@@ -272,11 +270,16 @@ def _try_env_step(self, actions):
                 # data and repeating the step attempt).
                 return ENV_STEP_FAILURE
             else:
-                if isinstance(e, StepFailedRecreateEnvError):
-                    raise ValueError(
-                        "Environment raised StepFailedRecreateEnvError but config.restart_failed_sub_environments is False."
-                    ) from e
-                raise e
+                logger.exception(
+                    f"RLlib {self.__class__.__name__}: Environment step failed and "
+                    "'config.restart_failed_sub_environments' is False. "
+                    "This env will not be recreated. "
+                    "Consider setting 'fault_tolerance(restart_failed_sub_environments=True)' in your AlgorithmConfig "
+                    "in order to automatically re-create and force-reset an env. "
+                    f"The original error type: {type(e)}. "
+                    f"{e}"
+                )
+                raise RuntimeError from e
 
     def _convert_to_tensor(self, struct) -> TensorType:
         """Converts structs to a framework-specific tensor."""
diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py
@@ -363,9 +363,6 @@ def _sample(
             # Try stepping the environment.
             results = self._try_env_step(actions_for_env)
             if results == ENV_STEP_FAILURE:
-                logging.warning(
-                    f"RLlib {self.__class__.__name__}: Environment step failed. Will force reset env(s) in this EnvRunner."
-                )
                 return self._sample(
                     num_timesteps=num_timesteps,
                     num_episodes=num_episodes,

Original file line number	Diff line number	Diff line change
`@@ -652,7 +652,7 @@ def _get_state_dict(self, state):`
`652`	`652`	`"total_rows": op.num_output_rows_total(),`
`653`	`653`	`"queued_blocks": op_state.total_enqueued_input_blocks(),`
`654`	`654`	`"state": DatasetState.FINISHED.name`
`655`		`- if op.execution_finished()`
	`655`	`+ if op.has_execution_finished()`
`656`	`656`	`else state,`
`657`	`657`	`}`
`658`	`658`	`for i, (op, op_state) in enumerate(self._topology.items())`