From bde7f33c4743ec8bb6b01767cceab13dff98a958 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Sat, 22 Jun 2024 09:20:30 +0200
Subject: [PATCH 1/3] wip

Signed-off-by: sven1977
---
 .../yaml_files/bc/bc-halfcheetah-v4.yaml      |  1 -
 .../yaml_files/cql/cql-halfcheetah-v4.yaml    |  1 -
 .../marwil/marwil-halfcheetah-v4.yaml         |  1 -
 rllib/algorithms/algorithm.py                 | 40 +------------------
 rllib/algorithms/algorithm_config.py          | 23 ++++-------
 rllib/algorithms/cql/tests/test_cql.py        | 12 +++---
 rllib/algorithms/marwil/tests/test_marwil.py  |  1 -
 rllib/algorithms/tests/test_algorithm.py      |  8 ++--
 .../evaluation_parallel_to_training.py        |  5 ---
 9 files changed, 18 insertions(+), 74 deletions(-)

diff --git a/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml
index 199022e32d99..df0ebbf6a3b8 100644
--- a/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml
+++ b/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml
@@ -25,7 +25,6 @@ bc-halfcheetah-v0:
     evaluation_interval: 3
     evaluation_config:
       input: sampler
-      always_attach_evaluation_results: True
     num_workers: 8
     lr: 0.001
     grad_clip: 40
diff --git a/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml
index 32b7299b9f7f..67295091800f 100644
--- a/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml
+++ b/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml
@@ -53,4 +53,3 @@ cql-halfcheetah-v4:
     evaluation_duration_unit: episodes
     evaluation_config:
       input: sampler
-      always_attach_evaluation_results: True
diff --git a/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml
index 59ff10051cfb..f62a95f5144d 100644
--- a/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml
+++ b/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml
@@ -29,7 +29,6 @@ marwil-halfcheetah-v4:
     evaluation_config:
      input: sampler
      off_policy_estimation_methods: null
-     always_attach_evaluation_results: True
    num_workers: 8
    lr: 0.001
    grad_clip: 40
diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py
index cd4a1f8c8a08..39824b8c56f4 100644
--- a/rllib/algorithms/algorithm.py
+++ b/rllib/algorithms/algorithm.py
@@ -541,19 +541,6 @@ def default_logger_creator(config):
         self.evaluation_config: Optional[AlgorithmConfig] = None
         # Evaluation EnvRunnerGroup and metrics last returned by `self.evaluate()`.
         self.evaluation_workers: Optional[EnvRunnerGroup] = None
-        # Initialize common evaluation_metrics to nan, before they become
-        # available. We want to make sure the metrics are always present
-        # (although their values may be nan), so that Tune doesn't complain
-        # when we use these as stopping criteria.
-        self.evaluation_metrics = {
-            EVALUATION_RESULTS: {
-                ENV_RUNNER_RESULTS: {
-                    EPISODE_RETURN_MAX: np.nan,
-                    EPISODE_RETURN_MIN: np.nan,
-                    EPISODE_RETURN_MEAN: np.nan,
-                },
-            },
-        }

         super().__init__(
             config=config,
@@ -849,8 +836,6 @@ def step(self) -> ResultDict:
             self.config.evaluation_interval
             and (self.iteration + 1) % self.config.evaluation_interval == 0
         )
-        evaluate_in_general = bool(self.config.evaluation_interval)
-
         # Results dict for training (and if applicable: evaluation).
         train_results: ResultDict = {}
         eval_results: ResultDict = {}
@@ -886,19 +871,6 @@ def step(self) -> ResultDict:
         if evaluate_this_iter and not self.config.evaluation_parallel_to_training:
             eval_results = self._run_one_evaluation(parallel_train_future=None)

-        # Attach latest available evaluation results to train results, if necessary.
-        if (
-            evaluate_in_general
-            and not evaluate_this_iter
-            and self.config.always_attach_evaluation_results
-        ):
-            if not isinstance(self.evaluation_metrics, dict):
-                raise ValueError(
-                    "Algorithm.evaluate() needs to return a ResultDict, but returned "
-                    f"{self.evaluation_metrics}!"
-                )
-            eval_results = self.evaluation_metrics
-
         # Sync EnvRunner workers.
         # TODO (sven): For the new API stack, the common execution pattern for any algo
         #  should be: [sample + get_metrics + get_state] -> send all these in one remote
@@ -3219,17 +3191,7 @@ def _run_one_evaluation(
                 "num_remote_worker_restarts"
             ] = self.evaluation_workers.num_remote_worker_restarts()

-        # Evaluation does not run for every step.
-        # Save evaluation metrics on Algorithm, so it can be attached to
-        # subsequent step results as latest evaluation result.
-        self.evaluation_metrics = {EVALUATION_RESULTS: eval_results}
-        # To make the old stack forward compatible with the new API stack metrics
-        # structure, we add everything under the new key (EVALUATION_RESULTS) as well as
-        # the old one ("evaluation").
-        if not self.config.enable_env_runner_and_connector_v2:
-            self.evaluation_metrics["evaluation"] = eval_results
-
-        return self.evaluation_metrics
+        return {EVALUATION_RESULTS: eval_results}

     def _run_one_training_iteration_and_evaluation_in_parallel(
         self,
diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py
index dccf062a22fb..4a2603378be1 100644
--- a/rllib/algorithms/algorithm_config.py
+++ b/rllib/algorithms/algorithm_config.py
@@ -452,7 +452,6 @@ def __init__(self, algo_class: Optional[type] = None):
         self.ope_split_batch_by_episode = True
         self.evaluation_num_env_runners = 0
         self.custom_evaluation_function = None
-        self.always_attach_evaluation_results = True
         # TODO: Set this flag still in the config or - much better - in the
         #  RolloutWorker as a property.
         self.in_evaluation = False
@@ -547,6 +546,7 @@ def __init__(self, algo_class: Optional[type] = None):
         self._enable_rl_module_api = DEPRECATED_VALUE
         self.auto_wrap_old_gym_envs = DEPRECATED_VALUE
         self.disable_env_checking = DEPRECATED_VALUE
+        self.always_attach_evaluation_results = DEPRECATED_VALUE

         # The following values have moved because of the new ReplayBuffer API
         self.buffer_size = DEPRECATED_VALUE
@@ -2212,9 +2212,8 @@ def evaluation(
         ope_split_batch_by_episode: Optional[bool] = NotProvided,
         evaluation_num_env_runners: Optional[int] = NotProvided,
         custom_evaluation_function: Optional[Callable] = NotProvided,
-        always_attach_evaluation_results: Optional[bool] = NotProvided,
         # Deprecated args.
-        evaluation_num_episodes=DEPRECATED_VALUE,
+        always_attach_evaluation_results=DEPRECATED_VALUE,
         evaluation_num_workers=DEPRECATED_VALUE,
     ) -> "AlgorithmConfig":
         """Sets the config's evaluation settings.
@@ -2300,28 +2299,24 @@ def evaluation(
                 metrics: dict. See the Algorithm.evaluate() method to see the default
                 implementation. The Algorithm guarantees all eval workers have the
                 latest policy state before this function is called.
-            always_attach_evaluation_results: Make sure the latest available evaluation
-                results are always attached to a step result dict.
-                This may be useful if Tune or some other meta controller needs
-                access to evaluation metrics all the time.

        Returns:
            This updated AlgorithmConfig object.
        """
-        if evaluation_num_episodes != DEPRECATED_VALUE:
+        if always_attach_evaluation_results != DEPRECATED_VALUE:
             deprecation_warning(
-                old="AlgorithmConfig.evaluation(evaluation_num_episodes=..)",
-                new="AlgorithmConfig.evaluation(evaluation_duration=.., "
-                "evaluation_duration_unit='episodes')",
+                old="AlgorithmConfig.evaluation(always_attach_evaluation_results=..)",
+                help="This setting is no longer needed, b/c Tune does not error "
+                "anymore (only warns) when a metrics key can't be found in the "
+                "results.",
                 error=True,
             )
         if evaluation_num_workers != DEPRECATED_VALUE:
             deprecation_warning(
                 old="AlgorithmConfig.evaluation(evaluation_num_workers=..)",
                 new="AlgorithmConfig.evaluation(evaluation_num_env_runners=..)",
-                error=False,
+                error=True,
             )
-            self.evaluation_num_env_runners = evaluation_num_workers

         if evaluation_interval is not NotProvided:
             self.evaluation_interval = evaluation_interval
@@ -2360,8 +2355,6 @@ def evaluation(
             self.evaluation_num_env_runners = evaluation_num_env_runners
         if custom_evaluation_function is not NotProvided:
             self.custom_evaluation_function = custom_evaluation_function
-        if always_attach_evaluation_results is not NotProvided:
-            self.always_attach_evaluation_results = always_attach_evaluation_results
         if ope_split_batch_by_episode is not NotProvided:
             self.ope_split_batch_by_episode = ope_split_batch_by_episode

diff --git a/rllib/algorithms/cql/tests/test_cql.py b/rllib/algorithms/cql/tests/test_cql.py
index 063c42c04060..b658b0c323c7 100644
--- a/rllib/algorithms/cql/tests/test_cql.py
+++ b/rllib/algorithms/cql/tests/test_cql.py
@@ -64,7 +64,6 @@ def test_cql_compilation(self):
                 bc_iters=2,
             )
             .evaluation(
-                always_attach_evaluation_results=True,
                 evaluation_interval=2,
                 evaluation_duration=10,
                 evaluation_config=cql.CQLConfig.overrides(input_="sampler"),
@@ -83,11 +82,12 @@ def test_cql_compilation(self):
             results = algo.train()
             check_train_results(results)
             print(results)
-            eval_results = results[EVALUATION_RESULTS]
-            print(
-                f"iter={algo.iteration} "
-                f"R={eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}"
-            )
+            eval_results = results.get(EVALUATION_RESULTS)
+            if eval_results:
+                print(
+                    f"iter={algo.iteration} "
+                    f"R={eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}"
+                )
             check_compute_single_action(algo)

             # Get policy and model.
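Note on the test change directly above: after this series, EVALUATION_RESULTS only appears in a train() result dict on iterations in which evaluation actually ran (every `evaluation_interval`-th iteration), so downstream code should treat the key as optional rather than rely on the removed always_attach_evaluation_results flag. A minimal sketch of that pattern (hypothetical driver snippet, not part of this patch; assumes an already built Algorithm object named `algo` with an evaluation_interval configured):

    from ray.rllib.utils.metrics import (
        ENV_RUNNER_RESULTS,
        EPISODE_RETURN_MEAN,
        EVALUATION_RESULTS,
    )

    for i in range(4):
        results = algo.train()
        # The key is only present when evaluation ran during this iteration.
        eval_results = results.get(EVALUATION_RESULTS)
        if eval_results:
            print(f"iter={i} R={eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}")
        else:
            print(f"iter={i} (no evaluation ran this iteration)")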
diff --git a/rllib/algorithms/marwil/tests/test_marwil.py b/rllib/algorithms/marwil/tests/test_marwil.py
index 3722554f7dec..4430f6df0856 100644
--- a/rllib/algorithms/marwil/tests/test_marwil.py
+++ b/rllib/algorithms/marwil/tests/test_marwil.py
@@ -60,7 +60,6 @@ def test_marwil_compilation_and_learning_from_offline_file(self):
                 evaluation_parallel_to_training=True,
                 evaluation_config=marwil.MARWILConfig.overrides(input_="sampler"),
                 off_policy_estimation_methods={},
-                always_attach_evaluation_results=True,
             )
             .offline_data(input_=[data_file])
         )
diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py
index af67840d3f33..9fedadaf48bb 100644
--- a/rllib/algorithms/tests/test_algorithm.py
+++ b/rllib/algorithms/tests/test_algorithm.py
@@ -250,7 +250,6 @@ def test_evaluation_option(self):
                 evaluation_duration=2,
                 evaluation_duration_unit="episodes",
                 evaluation_config=dqn.DQNConfig.overrides(gamma=0.98),
-                always_attach_evaluation_results=False,
             )
             .callbacks(callbacks_class=AssertEvalCallback)
         )
@@ -290,14 +289,13 @@ def test_evaluation_option_always_attach_eval_metrics(self):
                 evaluation_duration=2,
                 evaluation_duration_unit="episodes",
                 evaluation_config=dqn.DQNConfig.overrides(gamma=0.98),
-                always_attach_evaluation_results=True,
             )
             .reporting(min_sample_timesteps_per_iteration=100)
             .callbacks(callbacks_class=AssertEvalCallback)
         )
         for _ in framework_iterator(config, frameworks=("torch", "tf")):
             algo = config.build()
-            # Should always see latest available eval results.
+            # Should only see eval results, when eval actually ran.
             r0 = algo.train()
             r1 = algo.train()
             r2 = algo.train()
@@ -307,9 +305,9 @@ def test_evaluation_option_always_attach_eval_metrics(self):
             # Eval results are not available at step 0.
             # But step 3 should still have it, even though no eval was
             # run during that step.
-            self.assertTrue(EVALUATION_RESULTS in r0)
+            self.assertTrue(EVALUATION_RESULTS not in r0)
             self.assertTrue(EVALUATION_RESULTS in r1)
-            self.assertTrue(EVALUATION_RESULTS in r2)
+            self.assertTrue(EVALUATION_RESULTS not in r2)
             self.assertTrue(EVALUATION_RESULTS in r3)

     def test_evaluation_wo_evaluation_env_runner_group(self):
diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py
index 382cec33c77e..4d5814ab2489 100644
--- a/rllib/examples/evaluation/evaluation_parallel_to_training.py
+++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py
@@ -164,11 +164,6 @@ def on_train_result(
                     "Number of run evaluation timesteps: "
                    f"{num_timesteps_reported} (ok)!"
                )
-            # Expect at least evaluation/env_runners to be always available.
-            elif algorithm.config.always_attach_evaluation_results and (
-                not eval_env_runner_results
-            ):
-                raise KeyError("`evaluation->env_runners` not found in result dict!")


 if __name__ == "__main__":

From 514ca69c94c1db08f1f2bbc65363b510157dacb8 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Sat, 22 Jun 2024 09:44:25 +0200
Subject: [PATCH 2/3] wip

Signed-off-by: sven1977
---
 rllib/env/multi_agent_env_runner.py  | 6 ++----
 rllib/env/single_agent_env_runner.py | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py
index 98d08bf6c603..6144b5411f60 100644
--- a/rllib/env/multi_agent_env_runner.py
+++ b/rllib/env/multi_agent_env_runner.py
@@ -268,7 +268,7 @@ def _sample_timesteps(
                 # MARLModule forward pass: Explore or not.
                 if explore:
                     env_steps_lifetime = self.metrics.peek(
-                        NUM_ENV_STEPS_SAMPLED_LIFETIME
+                        NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
                     ) + self.metrics.peek(NUM_ENV_STEPS_SAMPLED, default=0)
                     to_env = self.module.forward_exploration(
                         to_module, t=env_steps_lifetime
@@ -465,7 +465,7 @@ def _sample_episodes(
                 # MARLModule forward pass: Explore or not.
                 if explore:
                     env_steps_lifetime = self.metrics.peek(
-                        NUM_ENV_STEPS_SAMPLED_LIFETIME
+                        NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
                     ) + self.metrics.peek(NUM_ENV_STEPS_SAMPLED, default=0)
                     to_env = self.module.forward_exploration(
                         to_module, t=env_steps_lifetime
@@ -828,8 +826,6 @@ def _make_module(self):

     def _setup_metrics(self):
         self.metrics = MetricsLogger()
-        # Initialize lifetime counts.
-        self.metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0, reduce="sum")

         self._done_episodes_for_metrics: List[MultiAgentEpisode] = []
         self._ongoing_episodes_for_metrics: DefaultDict[
diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py
index 0b6355f16a83..b1a10f89a912 100644
--- a/rllib/env/single_agent_env_runner.py
+++ b/rllib/env/single_agent_env_runner.py
@@ -67,8 +67,6 @@ def __init__(self, config: AlgorithmConfig, **kwargs):

         # Create a MetricsLogger object for logging custom stats.
         self.metrics = MetricsLogger()
-        # Initialize lifetime counts.
-        self.metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0, reduce="sum")

         # Create our callbacks object.
         self._callbacks: DefaultCallbacks = self.config.callbacks_class()

From 9eb92a6b4ed989ed744037246f1de6e447b44814 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Wed, 26 Jun 2024 09:07:29 +0200
Subject: [PATCH 3/3] wip

Signed-off-by: sven1977
---
 rllib/algorithms/algorithm_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py
index 4a2603378be1..ec2d00638512 100644
--- a/rllib/algorithms/algorithm_config.py
+++ b/rllib/algorithms/algorithm_config.py
@@ -2315,8 +2315,9 @@ def evaluation(
             deprecation_warning(
                 old="AlgorithmConfig.evaluation(evaluation_num_workers=..)",
                 new="AlgorithmConfig.evaluation(evaluation_num_env_runners=..)",
-                error=True,
+                error=False,
             )
+            self.evaluation_num_env_runners = evaluation_num_workers

         if evaluation_interval is not NotProvided:
             self.evaluation_interval = evaluation_interval
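Note on the EnvRunner changes in PATCH 2/3: pre-seeding the lifetime counter at setup time is no longer needed because MetricsLogger.peek() can fall back to a caller-provided default when the key has not been logged yet. A minimal sketch of that pattern (standalone snippet, not the runners' actual code; assumes the MetricsLogger import path used elsewhere in RLlib):

    from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
    from ray.rllib.utils.metrics.metrics_logger import MetricsLogger

    metrics = MetricsLogger()
    # Key not logged yet -> peek() returns the provided default instead of raising.
    print(metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0))  # 0
    # Once values are logged with reduce="sum", peek() returns the running sum.
    metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 200, reduce="sum")
    metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 300, reduce="sum")
    print(metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0))  # 500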