[RLlib] Initial design for Ray-Data based offline RL Algos (on new API stack). #44969

Merged (74 commits) on Jul 22, 2024. The changes shown below are from 50 of those commits.

Commits
3f13747
Sketched a simple offline data class with usage in 'Algorithm'.
simonsays1980 Apr 25, 2024
f33e595
LINTER.
simonsays1980 Apr 25, 2024
1496723
Merged master
simonsays1980 May 6, 2024
801911d
Implemented test for 'OfflineData' class and stored intermediate work.
simonsays1980 May 6, 2024
ff46fa2
LINTER.
simonsays1980 May 6, 2024
caa48d0
Added a basic workflow to convert batches into list of episodes for L…
simonsays1980 May 6, 2024
c748df8
Changed comment.
simonsays1980 May 10, 2024
6409007
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 13, 2024
d2f9030
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 14, 2024
a3416a8
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 15, 2024
8582ad9
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 16, 2024
b565f34
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 21, 2024
c0eed1f
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 22, 2024
341cb95
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 22, 2024
b76807f
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 24, 2024
c84fab8
Merged master.
simonsays1980 May 24, 2024
85cf954
Initial commit for BC with offline data API in new stack.
simonsays1980 May 24, 2024
69157f5
Implemented BC in new API stack with Ray Data API and Learner API usi…
simonsays1980 May 27, 2024
af9c9e9
Merge branch 'master' of https://github.com/ray-project/ray
simonsays1980 May 27, 2024
780b49d
Merge branch 'master' into offline-data-new-stack
simonsays1980 May 27, 2024
7337d08
Added new test to BUILD file.
simonsays1980 May 27, 2024
da83264
Added new test for offline data to BUILD file.
simonsays1980 May 27, 2024
82ae5bd
Added functionality to map batches directly to episode lists in an it…
simonsays1980 May 27, 2024
06ed2e5
Fixed bug in test.
simonsays1980 May 28, 2024
bafdcba
Added @sven1977's review.
simonsays1980 May 28, 2024
72cd797
Added locality hints for distributed training.
simonsays1980 May 28, 2024
848a205
Multi-learner initialization.
simonsays1980 May 31, 2024
d39ef0a
LINTER.
simonsays1980 May 31, 2024
4952de7
Tryout with callable class in 'map_batches'.
simonsays1980 Jun 6, 2024
d4479ff
Merged master.
simonsays1980 Jun 24, 2024
372a107
Added resampled JSONL CartPole-v1 dataset from cartpole-small.json wi…
simonsays1980 Jun 24, 2024
5ffa94b
Modified episode conversion to work with new data format for offline …
simonsays1980 Jun 25, 2024
dcf9524
Added 'batch_size' to the 'map_batches' and modified '_map_to_episode…
simonsays1980 Jun 25, 2024
04ce0f0
Added large CartPole-v1 data in new format. Modified BC algorithm, fi…
simonsays1980 Jun 25, 2024
72088fa
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jun 25, 2024
40e9b35
Started multi-learner setup.
simonsays1980 Jun 25, 2024
89b06fe
Added tuned example for BC with new offline API.
simonsays1980 Jun 25, 2024
0da5db1
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jun 26, 2024
8d5f1bd
Set up multi-learner training and tested it.
simonsays1980 Jun 26, 2024
5128bdb
Merged master.
simonsays1980 Jun 27, 2024
5d9dd97
Set default to 'parquet' files. Tested for different learner setups a…
simonsays1980 Jun 27, 2024
19d99a5
Added parquet files for cartpole and pendulum data. Also reset the de…
simonsays1980 Jun 27, 2024
c437210
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jun 28, 2024
f1a7663
Added @sven1977's review. Also added 'override_num_blocks' to tuned e…
simonsays1980 Jun 28, 2024
dea037b
Disabled hybrid stack. Tested old stack and made some cleanups.
simonsays1980 Jun 28, 2024
356222a
Much refactoring and fixing smaller and larger bugs related to transf…
simonsays1980 Jul 3, 2024
fb3eea0
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jul 3, 2024
8b16489
Readded the test for BC in old stack.
simonsays1980 Jul 3, 2024
68fabec
Fixed a small bug, due to the fact that 'Algorithm' objects do not ha…
simonsays1980 Jul 3, 2024
980ebcc
Reset concurrency. This was a relict from testing.
simonsays1980 Jul 3, 2024
8d45a78
Fixed some minor bugs that let tests failing.
simonsays1980 Jul 4, 2024
38b43cf
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jul 4, 2024
b7b0a34
Another small bug fix due to the hybrid stack.
simonsays1980 Jul 4, 2024
9870e5f
Set training step such that old and hybrid stacks are training on MAR…
simonsays1980 Jul 4, 2024
56e5de5
Refactored hybrid and new stack training logic into two separate meth…
simonsays1980 Jul 4, 2024
34fa38e
Some small nits.
simonsays1980 Jul 4, 2024
1a81cd9
More small nits fixed in test file for OfflineData.
simonsays1980 Jul 7, 2024
6b4939f
Merged master.
simonsays1980 Jul 9, 2024
a3fe5fb
Added changes to BC to enable multi-learner.
simonsays1980 Jul 9, 2024
88786b6
Fixed a bug in offline data tests and refactored. In addition changed…
simonsays1980 Jul 10, 2024
01a4f54
Fixed a bug in BC with policies being a set not a list.
simonsays1980 Jul 10, 2024
e8ade96
Fixed data path in BUILD.
simonsays1980 Jul 10, 2024
2b3af33
Fixed a small bug in 'LearnerGroup' due to mistyped arguments.
simonsays1980 Jul 11, 2024
b226c2e
Small modification of learning rate in multi-agent SAC test.
simonsays1980 Jul 11, 2024
b406295
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jul 12, 2024
6c855ad
Merged master.
simonsays1980 Jul 15, 2024
7b928d2
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jul 16, 2024
6582320
Added a further check to deal in Offline Data setups with 'PolicyServ…
simonsays1980 Jul 16, 2024
30c3f8d
Readded '_set_optimizer_state' after erroneously removing it.
simonsays1980 Jul 16, 2024
59cf300
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jul 17, 2024
a1cf4f1
Merged master and modified 'OfflineData' to integrate the newest chan…
simonsays1980 Jul 18, 2024
be43bcd
Saving state.
simonsays1980 Jul 18, 2024
30f4170
Fixed bug with synching the weights between learner and local worker …
simonsays1980 Jul 18, 2024
7591410
Merge branch 'master' into offline-data-new-stack
simonsays1980 Jul 19, 2024
18 changes: 18 additions & 0 deletions rllib/BUILD
@@ -782,6 +782,14 @@ py_test(
)

# BC
py_test(
name = "test_bc_old_stack",
tags = ["team:rllib", "algorithms_dir"],
size = "medium",
# Include the json data file.
data = ["tests/data/cartpole/large.json"],
srcs = ["algorithms/bc/tests/test_bc_old_stack.py"]
)

Contributor comment: nice! Old stack is no longer the norm :D

py_test(
name = "test_bc",
tags = ["team:rllib", "algorithms_dir"],
@@ -1531,6 +1539,16 @@ py_test(
srcs = ["offline/estimators/tests/test_dr_learning.py"],
)

py_test(
name = "test_offline_data",
tags = ["team:rllib", "offline"],
size = "small",
srcs = ["offline/tests/test_offline_data.py"],
data = [
"tests/data/pendulum/small.json",
],
)

# --------------------------------------------------------------------
# Policies
# rllib/policy/
43 changes: 42 additions & 1 deletion rllib/algorithms/algorithm.py
@@ -620,12 +620,28 @@ def setup(self, config: AlgorithmConfig) -> None:
validate_env=self.validate_env,
default_policy_class=self.get_default_policy_class(self.config),
config=self.config,
num_env_runners=self.config.num_env_runners,
num_env_runners=0 if self.config.input_ else self.config.num_env_runners,
local_env_runner=True,
logdir=self.logdir,
tune_trial_id=self.trial_id,
)

# Ensure remote workers are initially in sync with the local worker.
self.workers.sync_weights(inference_only=True)

Contributor comment: dumb question: Why do we need this additional sync? Which (currently existing) sync does this replace?

Contributor comment: I think this is a left-over from a missed merge with master? Could you check?

# If an input path is available and we are on the new API stack, generate
# an `OfflineData` instance.
if (
self.config.input_
and self.config.input_ != "sampler"
and self.config._enable_new_api_stack
):
from ray.rllib.offline.offline_data import OfflineData

self.offline_data = OfflineData(self.config)
# Otherwise set the attribute to `None`.
else:
self.offline_data = None

# Compile, validate, and freeze an evaluation config.
self.evaluation_config = self.config.get_evaluation_config_object()
self.evaluation_config.validate()
@@ -706,6 +722,8 @@ def setup(self, config: AlgorithmConfig) -> None:
# Need to add back method_type in case Algorithm is restored from checkpoint
method_config["type"] = method_type

# TODO (sven): Probably obsolete b/c the learner group is already None.
self.learner_group = None
if self.config.enable_rl_module_and_learner:
local_worker = self.workers.local_worker()
env = spaces = None
@@ -781,6 +799,29 @@ def setup(self, config: AlgorithmConfig) -> None:
),
)

if self.offline_data:
# If the learners are remote we need to provide specific
# information and the learner's actor handles.
if self.learner_group.is_remote:
# If learners run on different nodes, locality hints help
# to use the nearest learner in the workers that do the
# data preprocessing.
learner_node_ids = self.learner_group.foreach_learner(
lambda l: ray.get_runtime_context().get_node_id()
)
self.offline_data.locality_hints = [
node_id.get() for node_id in learner_node_ids
]
# Provide the actor handles for the learners for module
# updating during preprocessing.
self.offline_data.learner_handles = self.learner_group._workers
# Provide the module_spec. Note, in the remote case this is needed
# because the learner module cannot be copied, but must be built.
self.offline_data.module_spec = module_spec
# Otherwise we can simply pass in the local learner.
else:
self.offline_data.learner_handles = [self.learner_group._learner]

# Run `on_algorithm_init` callback after initialization is done.
self.callbacks.on_algorithm_init(algorithm=self, metrics_logger=self.metrics)

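A hedged illustration (not code from this PR) of how the locality hints gathered above can be consumed downstream: Ray Data's `streaming_split()` accepts node IDs so that each learner's data shard is produced close to that learner. The dataset path and node IDs are placeholders.

import ray

# Placeholder node IDs; in the PR they come from
# learner_group.foreach_learner(lambda l: ray.get_runtime_context().get_node_id()).
locality_hints = ["node-id-of-learner-0", "node-id-of-learner-1"]

# Placeholder path to offline data stored as Parquet.
ds = ray.data.read_parquet("/tmp/cartpole/parquet")

# One streaming iterator per learner, each produced near the hinted node.
per_learner_iterators = ds.streaming_split(
    n=len(locality_hints), equal=True, locality_hints=locality_hints
)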
35 changes: 34 additions & 1 deletion rllib/algorithms/algorithm_config.py
@@ -430,6 +430,10 @@ def __init__(self, algo_class: Optional[type] = None):

# `self.offline_data()`
self.input_ = "sampler"
self.input_read_method = "read_parquet"
self.input_read_method_kwargs = {}
self.prelearner_module_synch_period = 10
self.dataset_num_iters_per_learner = None
self.input_config = {}
self.actions_in_input_normalized = False
self.postprocess_inputs = False
@@ -2368,6 +2372,10 @@ def offline_data(
self,
*,
input_=NotProvided,
input_read_method=NotProvided,
input_read_method_kwargs=NotProvided,
prelearner_module_synch_period=NotProvided,
dataset_num_iters_per_learner=NotProvided,
input_config=NotProvided,
actions_in_input_normalized=NotProvided,
input_evaluation=NotProvided,
@@ -2392,7 +2400,24 @@
- A callable that takes an `IOContext` object as only arg and returns a
ray.rllib.offline.InputReader.
- A string key that indexes a callable with tune.registry.register_input
input_config: Arguments that describe the settings for reading the input.
input_read_method: Read method for the `ray.data.Dataset` to read in the
offline data from `input_`. The default is `read_parquet` for Parquet files.
See https://docs.ray.io/en/latest/data/api/input_output.html for more
info about available read methods in `ray.data`.
input_read_method_kwargs: kwargs for the `input_read_method`. These will be
passed into the read method without checking.
prelearner_module_synch_period: The period (number of batches converted)
after which the `RLModule` held by the `PreLearner` should sync weights.
The `PreLearner` is used to preprocess batches for the learners. The
higher this value, the more off-policy the `PreLearner`'s module will be.
Values that are too small will force the `PreLearner` to sync a lot with the
`Learner` and will slow down the data pipeline. The default value chosen
by the `OfflinePreLearner` is 10.
dataset_num_iters_per_learner: Number of iterations to run in each learner
during a single training iteration. If `None`, each learner runs a
complete epoch over its data block (the dataset is partitioned into
as many blocks as there are learners). The default is `None`.
input_config: Arguments that describe the settings for reading the input.
If input is `sampler`, this will be the environment configuration, e.g.
`env_name` and `env_config`, etc. See `EnvContext` for more info.
If the input is `dataset`, this will be e.g. `format`, `path`.
@@ -2430,6 +2455,14 @@
"""
if input_ is not NotProvided:
self.input_ = input_
if input_read_method is not NotProvided:
self.input_read_method = input_read_method
if input_read_method_kwargs is not NotProvided:
self.input_read_method_kwargs = input_read_method_kwargs
if prelearner_module_synch_period is not NotProvided:
self.prelearner_module_synch_period = prelearner_module_synch_period
if dataset_num_iters_per_learner is not NotProvided:
self.dataset_num_iters_per_learner = dataset_num_iters_per_learner
if input_config is not NotProvided:
if not isinstance(input_config, dict):
raise ValueError(
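For orientation, a minimal sketch (assumed usage, not taken from this PR) of how the `offline_data()` options documented above could be combined on a BC config; the data path is a placeholder.

from ray.rllib.algorithms.bc import BCConfig

config = (
    BCConfig()
    # New API stack (RLModule + Learner).
    .api_stack(enable_rl_module_and_learner=True)
    .offline_data(
        # Placeholder path to recorded experiences stored as Parquet.
        input_="/tmp/cartpole/parquet",
        # Any ray.data read_* method can be used here.
        input_read_method="read_parquet",
        # Passed through to the read method unchecked.
        input_read_method_kwargs={},
        # Sync the PreLearner's RLModule every 10 converted batches.
        prelearner_module_synch_period=10,
        # None lets each learner run a full pass over its data block.
        dataset_num_iters_per_learner=None,
    )
)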
119 changes: 56 additions & 63 deletions rllib/algorithms/bc/bc.py
@@ -4,16 +4,18 @@
from ray.rllib.algorithms.bc.bc_catalog import BCCatalog
from ray.rllib.algorithms.marwil.marwil import MARWIL, MARWILConfig
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
from ray.rllib.utils.annotations import override
from ray.rllib.utils.metrics import (
ALL_MODULES,
NUM_AGENT_STEPS_SAMPLED,
NUM_AGENT_STEPS_TRAINED,
NUM_ENV_STEPS_SAMPLED,
LEARNER_RESULTS,
LEARNER_UPDATE_TIMER,
OFFLINE_SAMPLING_TIMER,
NUM_ENV_STEPS_TRAINED,
SAMPLE_TIMER,
NUM_ENV_STEPS_TRAINED_LIFETIME,
NUM_MODULE_STEPS_TRAINED,
NUM_MODULE_STEPS_TRAINED_LIFETIME,
SYNCH_WORKER_WEIGHTS_TIMER,
TIMERS,
)
from ray.rllib.utils.typing import RLModuleSpec, ResultDict

@@ -74,8 +76,10 @@ def __init__(self, algo_class=None):
# Advantages (calculated during postprocessing)
# not important for behavioral cloning.
self.postprocess_inputs = False
# Set RLModule as default.
self.api_stack(enable_rl_module_and_learner=True)
# Set RLModule as default if the `EnvRunner`s are used.
if self.enable_env_runner_and_connector_v2:
self.api_stack(enable_rl_module_and_learner=True)

# __sphinx_doc_end__
# fmt: on

@@ -144,75 +148,64 @@ def training_step(self) -> ResultDict:
return super().training_step()
else:
# Implement logic using RLModule and Learner API.
# TODO (sven): Remove RolloutWorkers/EnvRunners for
# datasets. Use RolloutWorker/EnvRunner only for
# env stepping.
# TODO (simon): Take care of sampler metrics: right
# now all rewards are `nan`, which possibly confuses
# the user that sth. is not right, although it is as
# we do not step the env.
with self._timers[SAMPLE_TIMER]:
with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)):
# Sampling from offline data.
# TODO (simon): We have to remove the `RolloutWorker`
# here and just use the already distributed `dataset`
# for sampling. Only in online evaluation
# `RolloutWorker/EnvRunner` should be used.
if self.config.count_steps_by == "agent_steps":
train_batch = synchronous_parallel_sample(
worker_set=self.workers,
max_agent_steps=self.config.train_batch_size,
)
else:
train_batch = synchronous_parallel_sample(
worker_set=self.workers,
max_env_steps=self.config.train_batch_size,
)

# TODO (sven): Use metrics API as soon as we moved to new API stack
# (from currently hybrid stack).
# self.metrics.log_dict(
# {
# NUM_AGENT_STEPS_SAMPLED_LIFETIME: len(train_batch),
# NUM_ENV_STEPS_SAMPLED_LIFETIME: len(train_batch),
# },
# reduce="sum",
# )
self._counters[NUM_AGENT_STEPS_SAMPLED] += len(train_batch)
self._counters[NUM_ENV_STEPS_SAMPLED] += len(train_batch)

# Updating the policy.
train_results = self.learner_group.update_from_batch(batch=train_batch)
# TODO (sven): Use metrics API as soon as we moved to new API stack
# (from currently hybrid stack).
# self.metrics.log_dict(
# {
# NUM_AGENT_STEPS_TRAINED_LIFETIME: len(train_batch),
# NUM_ENV_STEPS_TRAINED_LIFETIME: len(train_batch),
# },
# reduce="sum",
# )
self._counters[NUM_AGENT_STEPS_TRAINED] += len(train_batch)
self._counters[NUM_ENV_STEPS_TRAINED] += len(train_batch)

batch = self.offline_data.sample(
num_samples=self.config.train_batch_size_per_learner,
num_shards=self.config.num_learners,
return_iterator=True if self.config.num_learners > 1 else False,
)

with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
# Updating the policy.
# TODO (simon, sven): Check, if we should execute directly s.th. like
# update_from_iterator.
learner_results = self.learner_group.update_from_batch(
batch,
minibatch_size=self.config.train_batch_size_per_learner,
num_iters=self.config.dataset_num_iters_per_learner,
)

# Log training results.
self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)
self.metrics.log_value(
NUM_ENV_STEPS_TRAINED_LIFETIME,
self.metrics.peek(
(LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED)
),
reduce="sum",
)
self.metrics.log_dict(
{
(LEARNER_RESULTS, mid, NUM_MODULE_STEPS_TRAINED_LIFETIME): (
stats[NUM_MODULE_STEPS_TRAINED]
)
for mid, stats in self.metrics.peek(LEARNER_RESULTS).items()
},
reduce="sum",
)
# Synchronize weights.
# As the results contain for each policy the loss and in addition the
# total loss over all policies is returned, this total loss has to be
# removed.
policies_to_update = set(train_results.keys()) - {ALL_MODULES}
modules_to_update = set(learner_results[0].keys()) - {ALL_MODULES}

# with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]:
# Update weights - after learning on the local worker -
# on all remote workers.
with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
if self.workers.num_remote_workers() > 0:
self.workers.sync_weights(
from_worker_or_learner_group=self.learner_group,
policies=policies_to_update,
policies=modules_to_update,
inference_only=True,
)
# Get weights from Learner to local worker.
# Then we must have a local worker.
else:
self.workers.local_worker().set_weights(
self.learner_group.get_weights()
)
weights = self.learner_group.get_weights(inference_only=True)
self.workers.local_worker().set_weights(weights)

# TODO (sven): Use metrics API as soon as we moved to new API stack
# (from currently hybrid stack).
return train_results
return self.metrics.reduce()
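A rough, hedged sketch of what the offline sampling path in the new `training_step()` boils down to (illustrative only; the actual logic lives in `OfflineData` and the prelearner): the input is a `ray.data` dataset that is read once and then iterated in batches.

import ray

# Placeholder path; with several learners the dataset would instead be split
# into one streaming shard per learner (the return_iterator=True case above).
ds = ray.data.read_parquet("/tmp/cartpole/parquet")

for batch in ds.iter_batches(batch_size=1024, batch_format="numpy"):
    # Each columnar batch gets converted into episodes / a train batch before
    # LearnerGroup.update_from_batch() is called.
    break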
25 changes: 25 additions & 0 deletions rllib/algorithms/bc/bc_learner.py
@@ -0,0 +1,25 @@
from ray.rllib.core.learner.learner import Learner
from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import (
AddObservationsFromEpisodesToBatch,
)
from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa
AddNextObservationsFromEpisodesToTrainBatch,
)
from ray.rllib.utils.annotations import (
override,
OverrideToImplementCustomLogic_CallToSuperRecommended,
)


class BCLearner(Learner):
@OverrideToImplementCustomLogic_CallToSuperRecommended
@override(Learner)
def build(self) -> None:
super().build()
# Prepend a NEXT_OBS from episodes to train batch connector piece (right
# after the observation default piece).
if self.config.add_default_connectors_to_learner_pipeline:
self._learner_connector.insert_after(
AddObservationsFromEpisodesToBatch,
AddNextObservationsFromEpisodesToTrainBatch(),
)

Contributor comment: Cool!

Collaborator (author) comment: Yeah, one has to know however which connectors are needed :D

Contributor comment: I think it's actually fine.
  • We need to document well all the off-the-shelf RLlib connectors, e.g. AddObsToBatch, Flatten, etc.
  • An algo now has the chance to assemble its connector pipeline based on needs (and assumptions that certain pieces will always be there, so the algo's custom ones can be prepended/appended to these).
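Picking up the review thread above, a hedged sketch of how another algorithm's Learner could assemble its connector pipeline in the same way; `MyExtraPiece` is a hypothetical connector, not part of this PR.

from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import (
    AddObservationsFromEpisodesToBatch,
)
from ray.rllib.core.learner.learner import Learner
from ray.rllib.utils.annotations import override


class MyOfflineLearner(Learner):
    @override(Learner)
    def build(self) -> None:
        super().build()
        if self.config.add_default_connectors_to_learner_pipeline:
            # Insert a custom piece right after the default observation piece,
            # mirroring what BCLearner does above with NEXT_OBS.
            self._learner_connector.insert_after(
                AddObservationsFromEpisodesToBatch,
                MyExtraPiece(),  # hypothetical custom connector piece
            )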