diff --git a/doc/source/rllib/doc_code/checkpoints.py b/doc/source/rllib/doc_code/checkpoints.py
index e8ca8ad2b320..00cb1a58b004 100644
--- a/doc/source/rllib/doc_code/checkpoints.py
+++ b/doc/source/rllib/doc_code/checkpoints.py
@@ -9,7 +9,15 @@
 # Base config used for both pickle-based checkpoint and msgpack-based one.
-config = PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0)
+config = (
+    PPOConfig()
+    .api_stack(
+        enable_rl_module_and_learner=False,
+        enable_env_runner_and_connector_v2=False,
+    )
+    .environment("CartPole-v1")
+    .env_runners(num_env_runners=0)
+)

 # Build algorithm object.
 algo1 = config.build()
diff --git a/doc/source/rllib/doc_code/custom_gym_env.py b/doc/source/rllib/doc_code/custom_gym_env.py
index ff659e22b1f4..925b212e4d2a 100644
--- a/doc/source/rllib/doc_code/custom_gym_env.py
+++ b/doc/source/rllib/doc_code/custom_gym_env.py
@@ -1,5 +1,6 @@
 # __rllib-custom-gym-env-begin__
 import gymnasium as gym
+import numpy as np

 import ray
 from ray.rllib.algorithms.ppo import PPOConfig
@@ -8,23 +9,23 @@
 class SimpleCorridor(gym.Env):
     def __init__(self, config):
         self.end_pos = config["corridor_length"]
-        self.cur_pos = 0
+        self.cur_pos = 0.0
         self.action_space = gym.spaces.Discrete(2)  # right/left
-        self.observation_space = gym.spaces.Discrete(self.end_pos)
+        self.observation_space = gym.spaces.Box(0.0, self.end_pos, shape=(1,))

     def reset(self, *, seed=None, options=None):
-        self.cur_pos = 0
-        return self.cur_pos, {}
+        self.cur_pos = 0.0
+        return np.array([self.cur_pos]), {}

     def step(self, action):
-        if action == 0 and self.cur_pos > 0:  # move right (towards goal)
-            self.cur_pos -= 1
+        if action == 0 and self.cur_pos > 0.0:  # move right (towards goal)
+            self.cur_pos -= 1.0
         elif action == 1:  # move left (towards start)
-            self.cur_pos += 1
+            self.cur_pos += 1.0
         if self.cur_pos >= self.end_pos:
-            return 0, 1.0, True, True, {}
+            return np.array([0.0]), 1.0, True, True, {}
         else:
-            return self.cur_pos, -0.1, False, False, {}
+            return np.array([self.cur_pos]), -0.1, False, False, {}


 ray.init()
diff --git a/doc/source/rllib/doc_code/rllib_in_60s.py b/doc/source/rllib/doc_code/rllib_in_60s.py
index a17de677cee0..6d214504f15d 100644
--- a/doc/source/rllib/doc_code/rllib_in_60s.py
+++ b/doc/source/rllib/doc_code/rllib_in_60s.py
@@ -2,20 +2,24 @@
 # __rllib-in-60s-begin__
 from ray.rllib.algorithms.ppo import PPOConfig
-from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.connectors.env_to_module import FlattenObservations

-config = (  # 1. Configure the algorithm,
+# 1. Configure the algorithm,
+config = (
     PPOConfig()
-    .environment(env="Taxi-v3")
-    .env_runners(num_env_runners=2)
-    .rl_module(model_config=DefaultModelConfig(fcnet_hiddens=[64, 64]))
+    .environment("Taxi-v3")
+    .env_runners(
+        num_env_runners=2,
+        # Observations are discrete (ints) -> We need to flatten (one-hot) them.
+        env_to_module_connector=lambda env: FlattenObservations(),
+    )
     .evaluation(evaluation_num_env_runners=1)
 )
-
-algo = config.build()  # 2. build the algorithm,
-
+# 2. build the algorithm ..
+algo = config.build()
+# 3. .. train it ..
 for _ in range(5):
-    print(algo.train())  # 3. train it,
-
-algo.evaluate()  # 4. and evaluate it.
+    print(algo.train())
+# 4. .. and evaluate it.
+algo.evaluate() # __rllib-in-60s-end__ diff --git a/doc/source/rllib/doc_code/rllib_on_ray_readme.py b/doc/source/rllib/doc_code/rllib_on_ray_readme.py index 93fbac10814c..6498a1732a91 100644 --- a/doc/source/rllib/doc_code/rllib_on_ray_readme.py +++ b/doc/source/rllib/doc_code/rllib_on_ray_readme.py @@ -1,5 +1,8 @@ # __quick_start_begin__ import gymnasium as gym +import numpy as np +import torch + from ray.rllib.algorithms.ppo import PPOConfig @@ -19,9 +22,9 @@ class SimpleCorridor(gym.Env): def __init__(self, config): self.end_pos = config["corridor_length"] - self.cur_pos = 0 + self.cur_pos = 0.0 self.action_space = gym.spaces.Discrete(2) # left and right - self.observation_space = gym.spaces.Box(0.0, self.end_pos, shape=(1,)) + self.observation_space = gym.spaces.Box(0.0, self.end_pos, (1,), np.float32) def reset(self, *, seed=None, options=None): """Resets the episode. @@ -29,9 +32,9 @@ def reset(self, *, seed=None, options=None): Returns: Initial observation of the new episode and an info dict. """ - self.cur_pos = 0 + self.cur_pos = 0.0 # Return initial observation. - return [self.cur_pos], {} + return np.array([self.cur_pos], np.float32), {} def step(self, action): """Takes a single step in the episode given `action`. @@ -50,23 +53,24 @@ def step(self, action): truncated = False # +1 when goal reached, otherwise -1. reward = 1.0 if terminated else -0.1 - return [self.cur_pos], reward, terminated, truncated, {} + return np.array([self.cur_pos], np.float32), reward, terminated, truncated, {} # Create an RLlib Algorithm instance from a PPOConfig object. config = ( PPOConfig().environment( # Env class to use (here: our gym.Env sub-class from above). - env=SimpleCorridor, + SimpleCorridor, # Config dict to be passed to our custom env's constructor. # Use corridor with 20 fields (including S and G). - env_config={"corridor_length": 28}, + env_config={"corridor_length": 20}, ) # Parallelize environment rollouts. .env_runners(num_env_runners=3) ) # Construct the actual (PPO) algorithm object from the config. algo = config.build() +rl_module = algo.get_module() # Train for n iterations and report results (mean episode rewards). # Since we have to move at least 19 times in the env to reach the goal and @@ -74,7 +78,7 @@ def step(self, action): # Expect to reach an optimal episode reward of `-0.1*18 + 1.0 = -0.8`. for i in range(5): results = algo.train() - print(f"Iter: {i}; avg. return={results['env_runners']['episode_return_mean']}") + print(f"Iter: {i}; avg. results={results['env_runners']}") # Perform inference (action computations) based on given env observations. # Note that we are using a slightly different env here (len 10 instead of 20), @@ -89,7 +93,12 @@ def step(self, action): while not terminated and not truncated: # Compute a single action, given the current observation # from the environment. - action = algo.compute_single_action(obs) + action_logits = rl_module.forward_inference( + {"obs": torch.from_numpy(obs).unsqueeze(0)} + )["action_dist_inputs"].numpy()[ + 0 + ] # [0]: B=1 + action = np.argmax(action_logits) # Apply the computed action in the environment. obs, reward, terminated, truncated, info = env.step(action) # Sum up rewards for reporting purposes. 
diff --git a/doc/source/rllib/doc_code/rlmodule_guide.py b/doc/source/rllib/doc_code/rlmodule_guide.py index a9f94f494125..cc7eb92c19d1 100644 --- a/doc/source/rllib/doc_code/rlmodule_guide.py +++ b/doc/source/rllib/doc_code/rlmodule_guide.py @@ -8,15 +8,7 @@ from ray.rllib.algorithms.ppo import PPOConfig -config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .framework("torch") - .environment("CartPole-v1") -) +config = PPOConfig().framework("torch").environment("CartPole-v1") algorithm = config.build() @@ -235,21 +227,15 @@ def _forward_train(self, batch: Dict[str, Any]) -> Dict[str, Any]: class BCTorchRLModuleWithSharedGlobalEncoder(TorchRLModule): """An RLModule with a shared encoder between agents for global observation.""" - def __init__( - self, - encoder: nn.Module, - local_dim: int, - hidden_dim: int, - action_dim: int, - config=None, - ) -> None: - super().__init__(config=config) - - self.encoder = encoder + def setup(self): + self.encoder = self.model_config["encoder"] self.policy_head = nn.Sequential( - nn.Linear(hidden_dim + local_dim, hidden_dim), + nn.Linear( + self.model_config["hidden_dim"] + self.model_config["local_dim"], + self.model_config["hidden_dim"], + ), nn.ReLU(), - nn.Linear(hidden_dim, action_dim), + nn.Linear(self.model_config["hidden_dim"], self.model_config["action_dim"]), ) def _forward_inference(self, batch: Dict[str, Any]) -> Dict[str, Any]: @@ -288,11 +274,14 @@ def setup(self): rl_modules = {} for module_id, module_spec in module_specs.items(): rl_modules[module_id] = BCTorchRLModuleWithSharedGlobalEncoder( - config=module_specs[module_id].get_rl_module_config(), - encoder=shared_encoder, - local_dim=module_spec.observation_space["local"].shape[0], - hidden_dim=hidden_dim, - action_dim=module_spec.action_space.n, + observation_space=module_spec.observation_space, + action_space=module_spec.action_space, + model_config={ + "local_dim": module_spec.observation_space["local"].shape[0], + "hidden_dim": hidden_dim, + "action_dim": module_spec.action_space.n, + "encoder": shared_encoder, + }, ) self._rl_modules = rl_modules @@ -345,14 +334,7 @@ def setup(self): from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleSpec -config = ( - PPOConfig() - # Enable the new API stack (RLModule and Learner APIs). - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ).environment("CartPole-v1") -) +config = PPOConfig().environment("CartPole-v1") env = gym.make("CartPole-v1") # Create an RL Module that we would like to checkpoint module_spec = RLModuleSpec( diff --git a/doc/source/rllib/doc_code/saving_and_loading_algos_and_policies.py b/doc/source/rllib/doc_code/saving_and_loading_algos_and_policies.py index 9202c904f5f9..0ba5e06b7775 100644 --- a/doc/source/rllib/doc_code/saving_and_loading_algos_and_policies.py +++ b/doc/source/rllib/doc_code/saving_and_loading_algos_and_policies.py @@ -4,7 +4,14 @@ # Create a PPO algorithm object using a config object .. from ray.rllib.algorithms.ppo import PPOConfig -my_ppo_config = PPOConfig().environment("CartPole-v1") +my_ppo_config = ( + PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) + .environment("CartPole-v1") +) my_ppo = my_ppo_config.build() # .. train one iteration .. 
@@ -60,21 +67,28 @@
 from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole

 # Set up a multi-agent Algorithm, training two policies independently.
-my_ma_config = PPOConfig().multi_agent(
-    # Which policies should RLlib create and train?
-    policies={"pol1", "pol2"},
-    # Let RLlib know, which agents in the environment (we'll have "agent1"
-    # and "agent2") map to which policies.
-    policy_mapping_fn=(
-        lambda agent_id, episode, worker, **kw: (
-            "pol1" if agent_id == "agent1" else "pol2"
-        )
-    ),
-    # Setting these isn't necessary. All policies will always be trained by default.
-    # However, since we do provide a list of IDs here, we need to remain in charge of
-    # changing this `policies_to_train` list, should we ever alter the Algorithm
-    # (e.g. remove one of the policies or add a new one).
-    policies_to_train=["pol1", "pol2"],  # Again, `None` would be totally fine here.
+my_ma_config = (
+    PPOConfig()
+    .api_stack(
+        enable_rl_module_and_learner=False,
+        enable_env_runner_and_connector_v2=False,
+    )
+    .multi_agent(
+        # Which policies should RLlib create and train?
+        policies={"pol1", "pol2"},
+        # Let RLlib know which agents in the environment (we'll have "agent1"
+        # and "agent2") map to which policies.
+        policy_mapping_fn=(
+            lambda agent_id, episode, worker, **kw: (
+                "pol1" if agent_id == "agent1" else "pol2"
+            )
+        ),
+        # Setting these isn't necessary. All policies will always be trained by default.
+        # However, since we do provide a list of IDs here, we need to remain in charge of
+        # changing this `policies_to_train` list, should we ever alter the Algorithm
+        # (e.g. remove one of the policies or add a new one).
+        policies_to_train=["pol1", "pol2"],  # Again, `None` would be totally fine here.
+    )
 )

 # Add the MultiAgentCartPole env to our config and build our Algorithm.
@@ -168,6 +182,10 @@
 # Set up an Algorithm with 5 Policies.
 algo_w_5_policies = (
     PPOConfig()
+    .api_stack(
+        enable_rl_module_and_learner=False,
+        enable_env_runner_and_connector_v2=False,
+    )
     .environment(
         env=MultiAgentCartPole,
         env_config={
@@ -225,7 +243,13 @@ def new_policy_mapping_fn(agent_id, episode, worker, **kwargs):
 # Create a new Algorithm (which contains a Policy, which contains a NN Model).
 # Switch on for native models to be included in the Policy checkpoints.
 ppo_config = (
-    PPOConfig().environment("Pendulum-v1").checkpointing(export_native_model_files=True)
+    PPOConfig()
+    .api_stack(
+        enable_rl_module_and_learner=False,
+        enable_env_runner_and_connector_v2=False,
+    )
+    .environment("Pendulum-v1")
+    .checkpointing(export_native_model_files=True)
 )

 # The default framework is TensorFlow, but if you would like to do this example with
diff --git a/doc/source/rllib/key-concepts.rst b/doc/source/rllib/key-concepts.rst
index 9efd1d86a3c9..25f1e21c9642 100644
--- a/doc/source/rllib/key-concepts.rst
+++ b/doc/source/rllib/key-concepts.rst
@@ -73,7 +73,15 @@ which implements the proximal policy optimization algorithm in RLlib.
     # Configure.
     from ray.rllib.algorithms.ppo import PPOConfig

-    config = PPOConfig().environment(env="CartPole-v1").training(train_batch_size=4000)
+    config = (
+        PPOConfig()
+        .api_stack(
+            enable_rl_module_and_learner=True,
+            enable_env_runner_and_connector_v2=True,
+        )
+        .environment("CartPole-v1")
+        .training(train_batch_size_per_learner=4000)
+    )

     # Build.
     algo = config.build()
@@ -91,7 +99,15 @@
     # Configure.
from ray.rllib.algorithms.ppo import PPOConfig - config = PPOConfig().environment(env="CartPole-v1").training(train_batch_size=4000) + config = ( + PPOConfig() + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("CartPole-v1") + .training(train_batch_size_per_learner=4000) + ) # Train via Ray Tune. tune.run("PPO", config=config) diff --git a/doc/source/rllib/rllib-learner.rst b/doc/source/rllib/rllib-learner.rst index 712a24146054..ff75cfe45859 100644 --- a/doc/source/rllib/rllib-learner.rst +++ b/doc/source/rllib/rllib-learner.rst @@ -57,10 +57,6 @@ arguments in the :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConf config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .learners( num_learners=0, # Set this to greater than 1 to allow for DDP style updates. num_gpus_per_learner=0, # Set this to 1 to enable GPU training. @@ -177,6 +173,9 @@ and :py:class:`~ray.rllib.core.learner.learner.Learner` APIs via the :py:class:` # Construct a new Learner using our config object. learner = config.build_learner(env=env) + # Needs to be called on the learner before calling any functions. + learner.build() + Updates ------- @@ -217,8 +216,8 @@ Updates } default_batch = SampleBatch(DUMMY_BATCH) DUMMY_BATCH = default_batch.as_multi_agent() - - learner.build() # needs to be called on the learner before calling any functions + # Make sure, we convert the batch to the correct framework (here: torch). + DUMMY_BATCH = learner._convert_batch_type(DUMMY_BATCH) .. tab-set:: diff --git a/rllib/BUILD b/rllib/BUILD index f040dbab4e73..080d42f883df 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -772,37 +772,6 @@ py_test( args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2"] ) -#@OldAPIStack -py_test( - name = "learning_tests_pendulum_ppo_old_api_stack", - main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_continuous", "no_tf_static_graph"], - size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer - srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/ppo/pendulum-ppo.yaml"], - args = ["--dir=tuned_examples/ppo"] -) -#@OldAPIStack -py_test( - name = "learning_tests_transformed_actions_pendulum_ppo_old_api_stack", - main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_continuous", "no_tf_static_graph"], - size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer - srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml"], - args = ["--dir=tuned_examples/ppo"] -) -#@OldAPIStack -py_test( - name = "learning_tests_repeat_after_me_ppo_old_api_stack", - main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"], - size = "medium", - srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/ppo/repeatafterme-ppo-lstm.yaml"], - args = ["--dir=tuned_examples/ppo"] -) - # SAC # Pendulum py_test( @@ -2078,13 +2047,6 @@ py_test( srcs = ["tests/test_placement_groups.py"] ) -py_test( - name = "tests/test_ray_client", - tags = ["team:rllib", "tests_dir"], - size = "large", - srcs = ["tests/test_ray_client.py"] -) - py_test( name = "tests/test_reproducibility", tags = ["team:rllib", 
"tests_dir"], @@ -3173,42 +3135,6 @@ py_test( args = ["--as-test", "--framework=torch", "--stop-reward=7.2"] ) -py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_after_me_tf2", - main = "examples/custom_recurrent_rnn_tokenizer.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] -) - -py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_initial_obs_env_tf2", - main = "examples/custom_recurrent_rnn_tokenizer.py", - tags = ["team:rllib", "examples"], - size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] -) - -py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_after_me_torch", - main = "examples/custom_recurrent_rnn_tokenizer.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] -) - -py_test( - name = "examples/custom_recurrent_rnn_tokenizer_repeat_initial_obs_env_torch", - main = "examples/custom_recurrent_rnn_tokenizer.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/custom_recurrent_rnn_tokenizer.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] -) - py_test( name = "examples/replay_buffer_api", tags = ["team:rllib", "examples"], diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 863e06eec904..15cd2d81d9d7 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2544,15 +2544,6 @@ def export_policy_model( onnx: If given, will export model in ONNX format. The value of this parameter set the ONNX OpSet version to use. If None, the output format will be DL framework specific. - - .. testcode:: - - from ray.rllib.algorithms.ppo import PPO, PPOConfig - config = PPOConfig().environment("CartPole-v1") - algo = PPO(config=config) - algo.train() - algo.export_policy_checkpoint("/tmp/export_dir") - algo.export_policy_model("/tmp/dir") """ self.get_policy(policy_id).export_model(export_dir, onnx) @@ -2573,14 +2564,6 @@ def export_policy_checkpoint( Raises: KeyError: if `policy_id` cannot be found in this Algorithm. - - .. 
testcode::
-
-            from ray.rllib.algorithms.ppo import PPO, PPOConfig
-            config = PPOConfig().environment("CartPole-v1")
-            algo = PPO(config=config)
-            algo.train()
-            algo.export_policy_checkpoint("/tmp/export_dir")
         """
         policy = self.get_policy(policy_id)
         if policy is None:
diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py
index d444c4347683..5bc6daca9db7 100644
--- a/rllib/algorithms/algorithm_config.py
+++ b/rllib/algorithms/algorithm_config.py
@@ -19,6 +19,7 @@
 )

 import gymnasium as gym
+import tree
 from packaging import version

 import ray
@@ -58,6 +59,7 @@
     deserialize_type,
     serialize_type,
 )
+from ray.rllib.utils.test_utils import check
 from ray.rllib.utils.torch_utils import TORCH_COMPILE_REQUIRED_VERSION
 from ray.rllib.utils.typing import (
     AgentID,
@@ -701,12 +703,18 @@ def update_from_dict(
         # Namely, we want to re-instantiate the exploration config this config had
         # inside `self.experimental()` before potentially overwriting it in the
         # following.
-        enable_rl_module_and_learner = config_dict.get(
+        enable_new_api_stack = config_dict.get(
             "_enable_new_api_stack",
-            config_dict.get("enable_rl_module_and_learner"),
+            config_dict.get(
+                "enable_rl_module_and_learner",
+                config_dict.get("enable_env_runner_and_connector_v2"),
+            ),
         )
-        if enable_rl_module_and_learner:
-            self.api_stack(enable_rl_module_and_learner=enable_rl_module_and_learner)
+        if enable_new_api_stack is not None:
+            self.api_stack(
+                enable_rl_module_and_learner=enable_new_api_stack,
+                enable_env_runner_and_connector_v2=enable_new_api_stack,
+            )

         # Modify our properties one by one.
         for key, value in config_dict.items():
@@ -750,7 +758,7 @@ def update_from_dict(
             elif key.startswith("evaluation_"):
                 eval_call[key] = value
             elif key == "exploration_config":
-                if enable_rl_module_and_learner:
+                if enable_new_api_stack:
                     self.exploration_config = value
                     continue
                 if isinstance(value, dict) and "type" in value:
@@ -4420,6 +4428,7 @@ def _validate_input_settings(self):

     def _validate_new_api_stack_settings(self):
         """Checks, whether settings related to the new API stack make sense."""
+        # Old API stack checks.
         if not self.enable_rl_module_and_learner:
             # Throw a warning if the user has used `self.rl_module(rl_module_spec=...)`
             # but has not enabled the new API stack at the same time.
@@ -4462,6 +4471,26 @@ def _validate_new_api_stack_settings(self):
                 "to False (old API stack), instead."
             )

+        # For those users who accidentally use the new API stack (because it's the
+        # default now for many algos), we need to make sure they are warned.
+        try:
+            tree.assert_same_structure(self.model, MODEL_DEFAULTS)
+            # Create copies excluding the specified key.
+            check(
+                {k: v for k, v in self.model.items() if k != "vf_share_layers"},
+                {k: v for k, v in MODEL_DEFAULTS.items() if k != "vf_share_layers"},
+            )
+        except Exception:
+            logger.warning(
+                "You configured a custom `model` config (probably through calling "
+                "`config.training(model=..)`), whereas your config uses the new API "
+                "stack! In order to switch off the new API stack, set in your config: "
+                "`config.api_stack(enable_rl_module_and_learner=False, "
+                "enable_env_runner_and_connector_v2=False)`. If you DO want to use "
+                "the new API stack, configure your model instead through: "
+                "`config.rl_module(model_config={..})`."
+            )
+
         # LR-schedule checking.
Scheduler.validate( fixed_value_or_schedule=self.lr, diff --git a/rllib/algorithms/bc/torch/bc_torch_rl_module.py b/rllib/algorithms/bc/torch/bc_torch_rl_module.py index d06c323b124e..bcdd3660e194 100644 --- a/rllib/algorithms/bc/torch/bc_torch_rl_module.py +++ b/rllib/algorithms/bc/torch/bc_torch_rl_module.py @@ -10,6 +10,8 @@ class BCTorchRLModule(TorchRLModule): @override(RLModule) def setup(self): + if self.catalog is None and hasattr(self, "_catalog_ctor_error"): + raise self._catalog_ctor_error # __sphinx_doc_begin__ # Build models from catalog. self.encoder = self.catalog.build_encoder(framework=self.framework) diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 91ca34450f6f..622718055e37 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -427,11 +427,12 @@ def validate(self) -> None: # Warn about new API stack on by default. if self.enable_rl_module_and_learner: logger.warning( - "You are running DQN on the new API stack! This is the new default " - "behavior for this algorithm. If you don't want to use the new API " - "stack, set `config.api_stack(enable_rl_module_and_learner=False, " - "enable_env_runner_and_connector_v2=False)`. For a detailed " - "migration guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html" # noqa + f"You are running {self.algo_class.__name__} on the new API stack! " + "This is the new default behavior for this algorithm. If you don't " + "want to use the new API stack, set `config.api_stack(" + "enable_rl_module_and_learner=False," + "enable_env_runner_and_connector_v2=False)`. For a detailed migration " + "guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html" # noqa ) if ( diff --git a/rllib/algorithms/dqn/dqn_rainbow_rl_module.py b/rllib/algorithms/dqn/dqn_rainbow_rl_module.py index c6dbafead5ae..2d7c1f97c0a8 100644 --- a/rllib/algorithms/dqn/dqn_rainbow_rl_module.py +++ b/rllib/algorithms/dqn/dqn_rainbow_rl_module.py @@ -29,6 +29,9 @@ class DQNRainbowRLModule(RLModule, InferenceOnlyAPI, TargetNetworkAPI): @override(RLModule) def setup(self): + if self.catalog is None and hasattr(self, "_catalog_ctor_error"): + raise self._catalog_ctor_error + # If a dueling architecture is used. self.uses_dueling: bool = self.model_config.get("dueling") # If double Q learning is used. diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 6077bd35ae91..172411d9276f 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -70,11 +70,6 @@ class PPOConfig(AlgorithmConfig): from ray.rllib.algorithms.ppo import PPOConfig config = PPOConfig() - # Activate new API stack. - config.api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) config.environment("CartPole-v1") config.env_runners(num_env_runners=1) config.training( @@ -93,11 +88,6 @@ class PPOConfig(AlgorithmConfig): config = ( PPOConfig() - # Activate new API stack. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) # Set the config object's env. .environment(env="CartPole-v1") # Update the config object's training parameters. @@ -122,6 +112,16 @@ def __init__(self, algo_class=None): """Initializes a PPOConfig instance.""" super().__init__(algo_class=algo_class or PPO) + self.exploration_config = { + # The Exploration class to use. In the simplest case, this is the name + # (str) of any class present in the `rllib.utils.exploration` package. 
+ # You can also provide the python class directly or the full location + # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. + # EpsilonGreedy"). + "type": "StochasticSampling", + # Add constructor kwargs here (if any). + } + # fmt: off # __sphinx_doc_begin__ self.lr_schedule = None @@ -149,6 +149,12 @@ def __init__(self, algo_class=None): # Override some of AlgorithmConfig's default values with PPO-specific values. self.num_env_runners = 2 self.model["vf_share_layers"] = False + + # `.api_stack()` + self.api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # __sphinx_doc_end__ # fmt: on @@ -156,16 +162,6 @@ def __init__(self, algo_class=None): self.sgd_minibatch_size = DEPRECATED_VALUE self.vf_share_layers = DEPRECATED_VALUE - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - @override(AlgorithmConfig) def get_default_rl_module_spec(self) -> RLModuleSpec: from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog @@ -304,6 +300,17 @@ def validate(self) -> None: # Call super's validation method. super().validate() + # Warn about new API stack on by default. + if self.enable_rl_module_and_learner: + logger.warning( + f"You are running {self.algo_class.__name__} on the new API stack! " + "This is the new default behavior for this algorithm. If you don't " + "want to use the new API stack, set `config.api_stack(" + "enable_rl_module_and_learner=False," + "enable_env_runner_and_connector_v2=False)`. For a detailed migration " + "guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html" # noqa + ) + # Synchronous sampling, on-policy/PPO algos -> Check mismatches between # `rollout_fragment_length` and `train_batch_size_per_learner` to avoid user # confusion. diff --git a/rllib/algorithms/ppo/ppo_rl_module.py b/rllib/algorithms/ppo/ppo_rl_module.py index 30ca5d843df1..833e8d9d4227 100644 --- a/rllib/algorithms/ppo/ppo_rl_module.py +++ b/rllib/algorithms/ppo/ppo_rl_module.py @@ -19,6 +19,9 @@ class PPORLModule(RLModule, InferenceOnlyAPI, ValueFunctionAPI, abc.ABC): @override(RLModule) def setup(self): + if self.catalog is None and hasattr(self, "_catalog_ctor_error"): + raise self._catalog_ctor_error + # __sphinx_doc_begin__ # If we have a stateful model, states for the critic need to be collected # during sampling and `inference-only` needs to be `False`. Note, at this diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index 3febf97fb2ca..575bcece9897 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -66,11 +66,6 @@ def test_ppo_compilation_and_schedule_mixins(self): # Build a PPOConfig object with the `SingleAgentEnvRunner` class. config = ( ppo.PPOConfig() - # Enable new API stack and use EnvRunner. 
- .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners(num_env_runners=0) .training( num_epochs=2, @@ -93,12 +88,10 @@ def test_ppo_compilation_and_schedule_mixins(self): num_iterations = 2 - # TODO (sven) Bring back "FrozenLake-v1" for env in [ - # "CliffWalking-v0", "CartPole-v1", "Pendulum-v1", - ]: # "ale_py:ALE/Breakout-v5"]: + ]: print("Env={}".format(env)) for lstm in [False]: print("LSTM={}".format(lstm)) @@ -132,10 +125,6 @@ def test_ppo_free_log_std(self): """Tests the free log std option works.""" config = ( ppo.PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("Pendulum-v1") .env_runners( num_env_runners=1, diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 69ceab171497..1d5f83639bb9 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -52,10 +52,6 @@ def test_save_to_path_and_restore_from_path(self): """Tests saving and loading the state of the PPO Learner Group.""" config = ( ppo.PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -92,10 +88,6 @@ def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 config = ( ppo.PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/ppo/tests/test_ppo_old_api_stack.py b/rllib/algorithms/ppo/tests/test_ppo_old_api_stack.py index edb2b3b3122e..c55bf2445b92 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_old_api_stack.py +++ b/rllib/algorithms/ppo/tests/test_ppo_old_api_stack.py @@ -125,6 +125,10 @@ def test_ppo_compilation_w_connectors(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .training( num_epochs=2, # Setup lr schedule for testing. @@ -190,6 +194,10 @@ def test_ppo_compilation_and_schedule_mixins(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .training( # Setup lr schedule for testing. 
lr_schedule=[[0, 5e-5], [256, 0.0]], @@ -255,6 +263,10 @@ def test_ppo_exploration_setup(self): """Tests, whether PPO runs with different exploration setups.""" config = ( ppo.PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment( "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, @@ -303,6 +315,10 @@ def test_ppo_free_log_std(self): config = ( ppo.PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -353,6 +369,10 @@ def test_ppo_loss_function(self): """ config = ( ppo.PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/sac/sac_rl_module.py b/rllib/algorithms/sac/sac_rl_module.py index 832df79e9ede..d6d1b783d326 100644 --- a/rllib/algorithms/sac/sac_rl_module.py +++ b/rllib/algorithms/sac/sac_rl_module.py @@ -54,6 +54,9 @@ class SACRLModule(RLModule, InferenceOnlyAPI, TargetNetworkAPI): @override(RLModule) def setup(self): + if self.catalog is None and hasattr(self, "_catalog_ctor_error"): + raise self._catalog_ctor_error + # If a twin Q architecture should be used. self.twin_q = self.model_config["twin_q"] diff --git a/rllib/algorithms/sac/tests/test_sac.py b/rllib/algorithms/sac/tests/test_sac.py index b9f0eba34ec8..53c5749f7966 100644 --- a/rllib/algorithms/sac/tests/test_sac.py +++ b/rllib/algorithms/sac/tests/test_sac.py @@ -7,11 +7,6 @@ from ray.rllib.algorithms import sac from ray.rllib.connectors.env_to_module.flatten_observations import FlattenObservations from ray.rllib.examples.envs.classes.random_env import RandomEnv -from ray.rllib.examples._old_api_stack.models.batch_norm_model import ( - KerasBatchNormModel, - TorchBatchNormModel, -) -from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.spaces.simplex import Simplex from ray.rllib.utils.test_utils import check_train_results_new_api_stack @@ -80,9 +75,6 @@ def test_sac_compilation(self): ) num_iterations = 1 - ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel) - ModelCatalog.register_custom_model("batch_norm_torch", TorchBatchNormModel) - image_space = Box(-1.0, 1.0, shape=(84, 84, 3)) simple_space = Box(-1.0, 1.0, shape=(3,)) diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py index aba304cc3e16..2175eb62091f 100644 --- a/rllib/algorithms/tests/test_algorithm.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -3,7 +3,6 @@ import os from pathlib import Path from random import choice -import time import unittest import ray @@ -27,7 +26,6 @@ LEARNER_RESULTS, ) from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.test_utils import check from ray.tune import register_env @@ -44,10 +42,6 @@ def tearDownClass(cls): def test_add_module_and_remove_module(self): config = ( ppo.PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment( env="multi_cart", env_config={"num_agents": 4}, @@ -213,6 +207,10 @@ def new_mapping_fn(agent_id, episode, **kwargs): def test_add_policy_and_remove_policy(self): config = ( ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) 
.environment( env=MultiAgentCartPole, env_config={ @@ -485,6 +483,10 @@ def test_evaluation_wo_evaluation_env_runner_group(self): # configured exact number of episodes per evaluation. config = ( ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment(env="CartPole-v1") .callbacks(callbacks_class=AssertEvalCallback) ) @@ -513,29 +515,6 @@ def test_evaluation_wo_evaluation_env_runner_group(self): algo_w_env_on_local_worker.stop() config.create_env_on_local_worker = False - def test_worker_validation_time(self): - """Tests the time taken by `validate_env_runners_after_construction=True`.""" - config = ppo.PPOConfig().environment(env="CartPole-v1") - config.validate_env_runners_after_construction = True - - # Test, whether validating one worker takes just as long as validating - # >> 1 workers. - config.num_env_runners = 1 - t0 = time.time() - algo = config.build() - total_time_1 = time.time() - t0 - print(f"Validating w/ 1 worker: {total_time_1}sec") - algo.stop() - - config.num_env_runners = 5 - t0 = time.time() - algo = config.build() - total_time_5 = time.time() - t0 - print(f"Validating w/ 5 workers: {total_time_5}sec") - algo.stop() - - check(total_time_5 / total_time_1, 1.0, atol=1.0) - def test_no_env_but_eval_workers_do_have_env(self): """Tests whether no env on workers, but env on eval workers works ok.""" script_path = Path(__file__) @@ -570,7 +549,14 @@ def test_no_env_but_eval_workers_do_have_env(self): def test_counters_after_checkpoint(self): # We expect algorithm to no start counters from zero after loading a # checkpoint on a fresh Algorithm instance - config = ppo.PPOConfig().environment(env="CartPole-v1") + config = ( + ppo.PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) + .environment(env="CartPole-v1") + ) algo = config.build() self.assertTrue(all(c == 0 for c in algo._counters.values())) diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index b88f16636698..36da463d43a9 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -169,15 +169,7 @@ def test_detect_atari_env(self): self.assertFalse(config.is_atari) def test_rl_module_api(self): - config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .environment("CartPole-v1") - .framework("torch") - ) + config = PPOConfig().environment("CartPole-v1").framework("torch") self.assertEqual(config.rl_module_spec.module_class, PPOTorchRLModule) @@ -231,14 +223,7 @@ def test_config_per_module(self): self.assertTrue(config_3 is config) def test_learner_api(self): - config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .environment("CartPole-v1") - ) + config = PPOConfig().environment("CartPole-v1") self.assertEqual(config.learner_class, PPOTorchLearner) diff --git a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py index b9979da368d3..a4b0f7720937 100644 --- a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py +++ b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py @@ -49,10 +49,6 @@ def policy_mapping_fn(agent_id, episode, **kwargs): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) 
.env_runners(rollout_fragment_length=4) .learners(**scaling_config) .environment(MultiAgentCartPole, env_config={"num_agents": num_agents}) @@ -184,10 +180,6 @@ def test_e2e_load_rl_module(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners(rollout_fragment_length=4) .learners(**scaling_config) .environment("CartPole-v1") diff --git a/rllib/algorithms/tests/test_callbacks_old_api_stack.py b/rllib/algorithms/tests/test_callbacks_old_api_stack.py index 0d72cd7abceb..c96bde5a7c51 100644 --- a/rllib/algorithms/tests/test_callbacks_old_api_stack.py +++ b/rllib/algorithms/tests/test_callbacks_old_api_stack.py @@ -70,6 +70,10 @@ def tearDownClass(cls): def test_episode_and_sample_callbacks(self): config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("CartPole-v1") .env_runners(num_env_runners=0) .callbacks(EpisodeAndSampleCallbacks) @@ -88,7 +92,12 @@ def test_episode_and_sample_callbacks(self): def test_on_sub_environment_created(self): config = ( - PPOConfig().environment("CartPole-v1") + PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) + .environment("CartPole-v1") # Create 4 sub-environments per remote worker. # Create 2 remote workers. .env_runners(num_envs_per_env_runner=4, num_env_runners=2) @@ -121,6 +130,10 @@ def test_on_sub_environment_created(self): def test_on_sub_environment_created_with_remote_envs(self): config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("CartPole-v1") .env_runners( # Make each sub-environment a ray actor. @@ -162,6 +175,10 @@ def test_on_episode_created(self): # starts. 
config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment( RandomEnv, env_config={ diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index ae8443b5b811..b977022cec9f 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -94,10 +94,6 @@ def tearDownClass(cls): def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -146,10 +142,6 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners( batch_mode="complete_episodes", @@ -195,23 +187,12 @@ def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): def test_overriding_on_episode_created_throws_error_on_new_api_stack(self): """Tests whether overriding `on_episode_created` raises error w/ SAEnvRunner.""" - config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .callbacks(OnEpisodeCreatedCallback) - ) + config = PPOConfig().callbacks(OnEpisodeCreatedCallback) self.assertRaises(ValueError, lambda: config.validate()) def test_tune_trial_id_visible_in_callbacks(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("multi_cart", env_config={"num_agents": 2}) .callbacks(OnEnvironmentCreatedCallback) .multi_agent( diff --git a/rllib/algorithms/tests/test_env_runner_failures.py b/rllib/algorithms/tests/test_env_runner_failures.py index 45308d1efaca..5fedec14eb0c 100644 --- a/rllib/algorithms/tests/test_env_runner_failures.py +++ b/rllib/algorithms/tests/test_env_runner_failures.py @@ -392,12 +392,7 @@ def _do_test_failing_recover(self, config, multi_agent=False): def test_fatal_single_agent(self): # Test the case where all workers fail (w/o recovery). self._do_test_failing_fatal( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .env_runners( + PPOConfig().env_runners( env_to_module_connector=lambda env: FlattenObservations(), ) ) @@ -405,12 +400,9 @@ def test_fatal_single_agent(self): def test_fatal_multi_agent(self): # Test the case where all workers fail (w/o recovery). 
self._do_test_failing_fatal( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .multi_agent(policies={"p0"}, policy_mapping_fn=lambda *a, **k: "p0"), + PPOConfig().multi_agent( + policies={"p0"}, policy_mapping_fn=lambda *a, **k: "p0" + ), ) def test_async_samples(self): @@ -436,10 +428,6 @@ def test_sync_replay(self): def test_multi_gpu(self): self._do_test_failing_ignore( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training( train_batch_size=10, @@ -451,10 +439,6 @@ def test_multi_gpu(self): def test_sync_samples(self): self._do_test_failing_ignore( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(optimizer={}) ) @@ -471,10 +455,6 @@ def test_env_crash_during_sampling_but_restart_crashed_sub_envs(self): config = ( PPOConfig() - .api_stack( - enable_env_runner_and_connector_v2=True, - enable_rl_module_and_learner=True, - ) .env_runners(num_env_runners=4) .fault_tolerance( # Re-start failed individual sub-envs (then continue). @@ -520,10 +500,6 @@ def test_eval_workers_failing_ignore(self): # Test the case where one eval worker fails, but we chose to ignore. self._do_test_failing_ignore( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(model={"fcnet_hiddens": [4]}), fail_eval=True, @@ -533,10 +509,6 @@ def test_eval_workers_parallel_to_training_failing_recover(self): # Test the case where all eval workers fail, but we chose to recover. config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .evaluation( evaluation_num_env_runners=1, @@ -556,10 +528,6 @@ def test_eval_workers_parallel_to_training_multi_agent_failing_recover( # to recover. config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent) .multi_agent( policies={"main", "p0", "p1"}, @@ -595,10 +563,6 @@ def test_workers_failing_recover(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -654,10 +618,6 @@ def test_modules_are_restored_on_recovered_worker(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent, num_env_runners=2, @@ -763,10 +723,6 @@ def test_eval_workers_failing_recover(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -904,7 +860,11 @@ def test_eval_workers_on_infinite_episodes(self): # horizon -> Expect warning and no proper evaluation results. 
config = ( PPOConfig() - .environment(env=RandomEnv, env_config={"p_terminated": 0.0}) + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) + .environment(RandomEnv, env_config={"p_terminated": 0.0}) .training(train_batch_size_per_learner=200) .evaluation( evaluation_num_env_runners=1, diff --git a/rllib/algorithms/tests/test_node_failures.py b/rllib/algorithms/tests/test_node_failures.py index dbac2e995f87..cd1ebbf0722c 100644 --- a/rllib/algorithms/tests/test_node_failures.py +++ b/rllib/algorithms/tests/test_node_failures.py @@ -53,10 +53,6 @@ def test_node_failure_ignore(self): # with fewer EnvRunners. config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners( num_env_runners=6, @@ -74,10 +70,6 @@ def test_node_failure_recreate_env_runners(self): # We recreate failed EnvRunners and continue training. config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners( num_env_runners=6, @@ -95,10 +87,6 @@ def test_node_failure_expect_crash(self): # We do not ignore EnvRunner failures and expect to crash upon failure. config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners( num_env_runners=6, diff --git a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py index 23c0cba79676..c65ba67ab43d 100644 --- a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py +++ b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py @@ -1,8 +1,12 @@ import argparse +import gymnasium as gym + from ray import tune, air from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.tune.registry import register_env # Note: # To run this benchmark you need to have a ray cluster of at least @@ -26,10 +30,18 @@ def _parse_args(): def main(pargs): + # Register our environment with tune. 
+ def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make("ale_py:ALE/Breakout-v5", **cfg), framestack=4 + ) + + register_env("env", _env_creator) + config = ( PPOConfig() .environment( - "ale_py:ALE/Breakout-v5", + "env", clip_rewards=True, env_config={ "frameskip": 1, @@ -80,9 +92,9 @@ def main(pargs): results = tuner.fit() - compiled_throughput = results[0].metrics["num_env_steps_sampled_throughput_per_sec"] - eager_throughput = results[1].metrics["num_env_steps_sampled_throughput_per_sec"] - print(f"Speed up (%): {100 * (compiled_throughput / eager_throughput - 1)}") + compiled_timer = results[0].metrics["timers"]["env_runner_sampling_timer"] + eager_timer = results[1].metrics["timers"]["env_runner_sampling_timer"] + print(f"Speed up (%): {100 * (1 - compiled_timer / eager_timer)}") if __name__ == "__main__": diff --git a/rllib/connectors/tests/test_action.py b/rllib/connectors/tests/test_action.py index 8e1fc65af43d..92da301214d3 100644 --- a/rllib/connectors/tests/test_action.py +++ b/rllib/connectors/tests/test_action.py @@ -1,3 +1,5 @@ +# @OldAPIStack + import unittest import gymnasium as gym diff --git a/rllib/connectors/tests/test_agent.py b/rllib/connectors/tests/test_agent.py index 6deb2dc29077..cc1acab22588 100644 --- a/rllib/connectors/tests/test_agent.py +++ b/rllib/connectors/tests/test_agent.py @@ -1,3 +1,5 @@ +# @OldAPIStack + import gymnasium as gym from gymnasium.spaces import Box import numpy as np @@ -274,6 +276,7 @@ def test_vr_connector_respects_training_or_inference_vr_flags(self): data = AgentConnectorDataType(0, 1, agent_data) config = PPOConfig().to_dict() + config["_enable_new_api_stack"] = False ctx = ConnectorContext( view_requirements=view_rq_dict, config=config, @@ -300,8 +303,6 @@ def test_vr_connector_respects_training_or_inference_vr_flags(self): check(sample_batch, sample_batch_expected) def test_vr_connector_shift_by_one(self): - """Test that the ViewRequirementAgentConnector can handle shift by one correctly and - can ignore future referencing view_requirements to respect causality""" view_rq_dict = { "state": ViewRequirement("obs"), "next_state": ViewRequirement( @@ -312,6 +313,7 @@ def test_vr_connector_shift_by_one(self): obs_arrs = np.arange(10)[:, None] + 1 config = PPOConfig().to_dict() + config["_enable_new_api_stack"] = False ctx = ConnectorContext( view_requirements=view_rq_dict, config=config, is_policy_recurrent=True ) @@ -347,6 +349,7 @@ def test_vr_connector_causal_slice(self): obs_arrs = np.arange(10)[:, None] + 1 config = PPOConfig().to_dict() + config["_enable_new_api_stack"] = False ctx = ConnectorContext( view_requirements=view_rq_dict, config=config, is_policy_recurrent=True ) @@ -419,6 +422,7 @@ def test_vr_connector_with_multiple_buffers(self): act_arrs = (np.arange(10)[:, None] + 1) * 100 n_steps = obs_arrs.shape[0] config = PPOConfig().to_dict() + config["_enable_new_api_stack"] = False ctx = ConnectorContext( view_requirements=view_rq_dict, config=config, is_policy_recurrent=True ) @@ -460,9 +464,12 @@ def test_vr_connector_with_multiple_buffers(self): def test_connector_pipline_with_view_requirement(self): """A very minimal test that checks wheter pipeline connectors work in a simulation rollout.""" - # TODO: make this test beefier and more comprehensive config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .framework("torch") .environment(env="CartPole-v1") .env_runners(create_env_on_local_worker=True) @@ -560,6 +567,7 @@ def 
test_vr_connector_only_keeps_useful_timesteps(self): } config = PPOConfig().to_dict() + config["_enable_new_api_stack"] = False ctx = ConnectorContext( view_requirements=view_rqs, config=config, @@ -594,6 +602,7 @@ def test_vr_connector_default_agent_collector_is_empty(self): } config = PPOConfig().to_dict() + config["_enable_new_api_stack"] = False ctx = ConnectorContext( view_requirements=view_rqs, config=config, diff --git a/rllib/connectors/tests/test_connector.py b/rllib/connectors/tests/test_connector.py index 1226bd9ff7ef..2d1e5a18855c 100644 --- a/rllib/connectors/tests/test_connector.py +++ b/rllib/connectors/tests/test_connector.py @@ -1,3 +1,5 @@ +# @OldAPIStack + import unittest import gymnasium as gym diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 537a48417705..e2ce4db2f17c 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -280,16 +280,15 @@ def build(self) -> None: return # Build learner connector pipeline used on this Learner worker. - if self.config.enable_env_runner_and_connector_v2: - # TODO (sven): Figure out which space to provide here. For now, - # it doesn't matter, as the default connector piece doesn't use - # this information anyway. - # module_spec = self._module_spec.as_multi_rl_module_spec() - self._learner_connector = self.config.build_learner_connector( - input_observation_space=None, - input_action_space=None, - device=self._device, - ) + # TODO (sven): Figure out which space to provide here. For now, + # it doesn't matter, as the default connector piece doesn't use + # this information anyway. + # module_spec = self._module_spec.as_multi_rl_module_spec() + self._learner_connector = self.config.build_learner_connector( + input_observation_space=None, + input_action_space=None, + device=self._device, + ) # Build the module to be trained by this learner. self._module = self._make_module() @@ -1306,7 +1305,7 @@ def _update_from_batch_or_episodes( episodes = tree.flatten(episodes) # Call the learner connector. - if self._learner_connector is not None and episodes is not None: + if episodes is not None: # Call the learner connector pipeline. with self.metrics.log_time((ALL_MODULES, LEARNER_CONNECTOR_TIMER)): shared_data = {} @@ -1336,6 +1335,15 @@ def _update_from_batch_or_episodes( {next(iter(self.module.keys())): batch}, env_steps=len(batch) ) + # TODO (sven): Remove this leftover hack here for the situation in which we + # did not go through the learner connector. + # Options: + # a) Either also pass given batches through the learner connector (even if + # episodes is None). (preferred solution) + # b) Get rid of the option to pass in a batch altogether. + if episodes is None: + batch = self._convert_batch_type(batch) + # Check the MultiAgentBatch, whether our RLModule contains all ModuleIDs # found in this batch. If not, throw an error. unknown_module_ids = set(batch.policy_batches.keys()) - set(self.module.keys()) @@ -1375,11 +1383,6 @@ def _update_from_batch_or_episodes( # `minibatch_size` and `num_epochs` are not set by the user. batch_iter = MiniBatchDummyIterator - # Convert input batch into a tensor batch (MultiAgentBatch) on the correct - # device (e.g. GPU). We move the batch already here to avoid having to move - # every single minibatch that is created in the `batch_iter` below. 
- if self._learner_connector is None: - batch = self._convert_batch_type(batch) batch = self._set_slicing_by_batch_id(batch, value=True) for tensor_minibatch in batch_iter( diff --git a/rllib/core/learner/tests/test_learner.py b/rllib/core/learner/tests/test_learner.py index de8e700629eb..884d0b60faf6 100644 --- a/rllib/core/learner/tests/test_learner.py +++ b/rllib/core/learner/tests/test_learner.py @@ -37,8 +37,9 @@ def test_end_to_end_update(self): min_loss = float("inf") for iter_i in range(1000): - batch = reader.next() - results = learner.update_from_batch(batch=batch.as_multi_agent()) + batch = reader.next().as_multi_agent() + batch = learner._convert_batch_type(batch) + results = learner.update_from_batch(batch=batch) loss = results[DEFAULT_MODULE_ID][Learner.TOTAL_LOSS_KEY].peek() min_loss = min(loss, min_loss) diff --git a/rllib/core/learner/tests/test_learner_group.py b/rllib/core/learner/tests/test_learner_group.py index 98300ade03df..71b6ff904619 100644 --- a/rllib/core/learner/tests/test_learner_group.py +++ b/rllib/core/learner/tests/test_learner_group.py @@ -28,6 +28,7 @@ from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.metrics import ALL_MODULES from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.torch_utils import convert_to_torch_tensor from ray.util.timer import _Timer @@ -48,130 +49,6 @@ } -# TODO(avnishn) Make this a ray task later. Currently thats not possible because the -# task is not dying after the test is done. This is a bug with ray core. -@ray.remote(num_gpus=1) -class RemoteTrainingHelper: - def local_training_helper(self, fw, scaling_mode) -> None: - if fw == "torch": - import torch - - torch.manual_seed(0) - else: - raise NotImplementedError - - env = gym.make("CartPole-v1") - - reader = get_cartpole_dataset_reader(batch_size=500) - batch = reader.next().as_multi_agent() - - config_overrides = LOCAL_CONFIGS[scaling_mode] - config = BaseTestingAlgorithmConfig().update_from_dict(config_overrides) - - learner_group = config.build_learner_group(env=env) - local_learner = config.build_learner(env=env) - - # Make the state of the learner and the local learner_group identical. - local_learner.set_state(learner_group.get_state()[COMPONENT_LEARNER]) - check(local_learner.get_state(), learner_group.get_state()[COMPONENT_LEARNER]) - - # Update and check state again. - learner_update = local_learner.update_from_batch(batch=batch) - learner_update = MetricsLogger.peek_results(learner_update) - learner_group_update = learner_group.update_from_batch(batch=batch) - check(learner_update, learner_group_update) - check(local_learner.get_state(), learner_group.get_state()[COMPONENT_LEARNER]) - - new_module_id = "test_module" - - add_module_to_learner_or_learner_group( - config, env, new_module_id, learner_group - ) - add_module_to_learner_or_learner_group( - config, env, new_module_id, local_learner - ) - - # make the state of the learner and the local learner_group identical - local_learner.set_state(learner_group.get_state()[COMPONENT_LEARNER]) - check(local_learner.get_state(), learner_group.get_state()[COMPONENT_LEARNER]) - - # Do another update. - batch = reader.next() - ma_batch = MultiAgentBatch( - {new_module_id: batch, DEFAULT_MODULE_ID: batch}, env_steps=batch.count - ) - # the optimizer state is not initialized fully until the first time that - # training is completed. A call to get state before that won't contain the - # optimizer state. 
So we do a dummy update here to initialize the optimizer - l0 = local_learner.get_state() - local_learner.update_from_batch(batch=ma_batch) - l1 = local_learner.get_state() - check( - l0["rl_module"]["default_policy"]["policy.0.bias"], - l1["rl_module"]["default_policy"]["policy.0.bias"], - false=True, - ) - check( - l0["rl_module"]["test_module"]["policy.0.bias"], - l1["rl_module"]["test_module"]["policy.0.bias"], - false=True, - ) - check( - l0["optimizer"]["default_policy_default_optimizer"]["state"][0]["exp_avg"], - l1["optimizer"]["default_policy_default_optimizer"]["state"][0]["exp_avg"], - false=True, - ) - check( - l0["optimizer"]["test_module_default_optimizer"]["state"], - {}, - ) - - lg0 = learner_group.get_state()[COMPONENT_LEARNER] - check(l0, lg0) - - learner_group.update_from_batch(batch=ma_batch) - lg1 = learner_group.get_state()[COMPONENT_LEARNER] - - check( - lg0["rl_module"]["default_policy"]["policy.0.bias"], - lg1["rl_module"]["default_policy"]["policy.0.bias"], - false=True, - ) - check( - lg0["rl_module"]["test_module"]["policy.0.bias"], - lg1["rl_module"]["test_module"]["policy.0.bias"], - false=True, - ) - check( - lg0["optimizer"]["default_policy_default_optimizer"]["state"][0]["exp_avg"], - lg1["optimizer"]["default_policy_default_optimizer"]["state"][0]["exp_avg"], - false=True, - ) - check( - lg0["optimizer"]["test_module_default_optimizer"]["state"], - {}, - ) - - check(l1["rl_module"]["test_module"], lg1["rl_module"]["test_module"]) - check( - l1["optimizer"]["test_module_default_optimizer"], - lg1["optimizer"]["test_module_default_optimizer"], - ) - # check(l1["rl_module"]["default_policy"], lg1["rl_module"]["default_policy"]) - - # local_learner.update_from_batch(batch=ma_batch) - # learner_group.update_from_batch(batch=ma_batch) - - # check(local_learner.get_state(), learner_group.get_state()[COMPONENT_LEARNER]) - # local_learner_results = local_learner.update_from_batch(batch=ma_batch) - # local_learner_results = MetricsLogger.peek_results(local_learner_results) - # learner_group_results = learner_group.update_from_batch(batch=ma_batch) - - # check(local_learner_results, learner_group_results) - - # check(local_learner.get_state(), learner_group.get_state()[COMPONENT_LEARNER]) - - class TestLearnerGroupSyncUpdate(unittest.TestCase): @classmethod def setUpClass(cls) -> None: @@ -207,20 +84,6 @@ def test_learner_group_build_from_algorithm_config(self): print(learner_group) learner_group.shutdown() - # def test_learner_group_local(self): - # fws = ["torch"] - - # test_iterator = itertools.product(fws, LOCAL_CONFIGS) - - # # run the logic of this test inside of a ray actor because we want tensorflow - # # resources to be gracefully released. Tensorflow blocks the gpu resources - # # otherwise between test cases, causing a gpu oom error. 
- # for fw, scaling_mode in test_iterator: - # print(f"Testing framework: {fw}, scaling_mode: {scaling_mode}") - # training_helper = RemoteTrainingHelper.remote() - # ray.get(training_helper.local_training_helper.remote(fw, scaling_mode)) - # del training_helper - def test_update_multi_gpu(self): return @@ -239,8 +102,8 @@ def test_update_multi_gpu(self): min_loss = float("inf") for iter_i in range(1000): - batch = reader.next() - results = learner_group.update_from_batch(batch=batch.as_multi_agent()) + batch = convert_to_torch_tensor(reader.next().as_multi_agent()) + results = learner_group.update_from_batch(batch=batch) loss = np.mean( [res[ALL_MODULES][Learner.TOTAL_LOSS_KEY] for res in results] @@ -279,7 +142,7 @@ def test_add_module_and_remove_module(self): config = BaseTestingAlgorithmConfig().update_from_dict(config_overrides) learner_group = config.build_learner_group(env=env) reader = get_cartpole_dataset_reader(batch_size=512) - batch = reader.next() + batch = convert_to_torch_tensor(reader.next()) # Update once with the default policy. learner_group.update_from_batch(batch.as_multi_agent()) @@ -451,7 +314,8 @@ def test_save_to_path_and_restore_from_path(self): # this is expanded to more scaling modes on the release ci. scaling_modes = ["local-cpu", "multi-gpu-ddp"] test_iterator = itertools.product(fws, scaling_modes) - batch = SampleBatch(FAKE_BATCH) + batch = SampleBatch(convert_to_torch_tensor(FAKE_BATCH)).as_multi_agent() + for fw, scaling_mode in test_iterator: print(f"Testing framework: {fw}, scaling mode: {scaling_mode}.") env = gym.make("CartPole-v1") @@ -469,7 +333,7 @@ def test_save_to_path_and_restore_from_path(self): initial_weights = learner_group.get_weights() # Do a single update. - learner_group.update_from_batch(batch.as_multi_agent()) + learner_group.update_from_batch(batch) weights_after_update = learner_group.get_state( components=COMPONENT_LEARNER + "/" + COMPONENT_RL_MODULE )[COMPONENT_LEARNER][COMPONENT_RL_MODULE] @@ -490,9 +354,7 @@ def test_save_to_path_and_restore_from_path(self): learner_group.restore_from_path(learner_after_1_update_checkpoint_dir) # Do another update. - results_2nd_update_with_break = learner_group.update_from_batch( - batch=batch.as_multi_agent() - ) + results_2nd_update_with_break = learner_group.update_from_batch(batch=batch) weights_after_2_updates_with_break = learner_group.get_state( components=COMPONENT_LEARNER + "/" + COMPONENT_RL_MODULE )[COMPONENT_LEARNER][COMPONENT_RL_MODULE] @@ -509,10 +371,8 @@ def test_save_to_path_and_restore_from_path(self): weights_after_restore.pop(COMPONENT_MULTI_RL_MODULE_SPEC) check(initial_weights, weights_after_restore) # Perform 2 updates to get to the same state as the previous learners. 
- learner_group.update_from_batch(batch.as_multi_agent()) - results_2nd_without_break = learner_group.update_from_batch( - batch=batch.as_multi_agent() - ) + learner_group.update_from_batch(batch) + results_2nd_without_break = learner_group.update_from_batch(batch=batch) weights_after_2_updates_without_break = learner_group.get_weights() learner_group.shutdown() del learner_group diff --git a/rllib/core/models/tests/test_catalog.py b/rllib/core/models/tests/test_catalog.py index f745f8d570a7..d201c60d5ab5 100644 --- a/rllib/core/models/tests/test_catalog.py +++ b/rllib/core/models/tests/test_catalog.py @@ -322,10 +322,6 @@ def build_vf_head(self, framework): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .rl_module( rl_module_spec=RLModuleSpec(catalog_class=MyCatalog), ) diff --git a/rllib/core/rl_module/default_model_config.py b/rllib/core/rl_module/default_model_config.py index ecd9e4b9b906..3a8e1da15a76 100644 --- a/rllib/core/rl_module/default_model_config.py +++ b/rllib/core/rl_module/default_model_config.py @@ -44,8 +44,8 @@ class DefaultModelConfig: from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig config = ( - PPOConfig(). - rl_module( + PPOConfig() + .rl_module( model_config=DefaultModelConfig(fcnet_hiddens=[32, 32]), ) ) diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 42aa0a780ed4..d429eb7f7bca 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -1,6 +1,7 @@ import abc import dataclasses from dataclasses import dataclass, field +import logging from typing import Any, Collection, Dict, Optional, Type, TYPE_CHECKING, Union import gymnasium as gym @@ -36,6 +37,8 @@ ) from ray.rllib.core.models.catalog import Catalog +logger = logging.getLogger("ray.rllib") + @PublicAPI(stability="alpha") @dataclass @@ -395,6 +398,7 @@ def __init__( # TODO (sven): Deprecate Catalog and replace with utility functions to create # primitive components based on obs- and action spaces. self.catalog = None + self._catalog_ctor_error = None # Deprecated self.config = config @@ -403,17 +407,10 @@ def __init__( old="RLModule(config=[RLModuleConfig])", new="RLModule(observation_space=.., action_space=.., inference_only=..," " learner_only=.., model_config=..)", - error=False, + help="See https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/custom_cnn_rl_module.py " # noqa + "for how to write a custom RLModule.", + error=True, ) - self.observation_space = self.config.observation_space - self.action_space = self.config.action_space - self.inference_only = self.config.inference_only - self.learner_only = self.config.learner_only - self.model_config = self.config.model_config_dict - try: - self.catalog = self.config.get_catalog() - except Exception: - pass else: self.observation_space = observation_space self.action_space = action_space @@ -426,8 +423,16 @@ def __init__( action_space=self.action_space, model_config_dict=self.model_config, ) - except Exception: - pass + except Exception as e: + logger.warning( + "Could not create a Catalog object for your RLModule! If you are " + "not using the new API stack yet, make sure to switch it off in " + "your config: `config.api_stack(enable_rl_module_and_learner=False" + ", enable_env_runner_and_connector_v2=False)`. Some algos already " + "use the new stack by default. Ignore this message, if your " + "RLModule does not use a Catalog to build its sub-components." 
+ ) + self._catalog_ctor_error = e # TODO (sven): Deprecate this. We keep it here for now in case users # still have custom models (or subclasses of RLlib default models) @@ -693,7 +698,7 @@ def set_state(self, state: StateDict) -> None: @override(Checkpointable) def get_ctor_args_and_kwargs(self): return ( - (self.config,), # *args + (), # *args { "observation_space": self.observation_space, "action_space": self.action_space, diff --git a/rllib/core/rl_module/tf/tests/test_tf_rl_module.py b/rllib/core/rl_module/tf/tests/test_tf_rl_module.py index 9e1c43faa836..2b3a7bb0a9e5 100644 --- a/rllib/core/rl_module/tf/tests/test_tf_rl_module.py +++ b/rllib/core/rl_module/tf/tests/test_tf_rl_module.py @@ -5,7 +5,6 @@ import tensorflow as tf from ray.rllib.core.columns import Columns -from ray.rllib.core.rl_module.rl_module import RLModuleConfig from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule from ray.rllib.utils.test_utils import check @@ -16,11 +15,9 @@ def test_compilation(self): env = gym.make("CartPole-v1") module = DiscreteBCTFModule( - config=RLModuleConfig( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, - ) + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"fcnet_hiddens": [32]}, ) self.assertIsInstance(module, TfRLModule) @@ -63,11 +60,9 @@ def test_forward(self): env = gym.make("CartPole-v1") module = DiscreteBCTFModule( - config=RLModuleConfig( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, - ) + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"fcnet_hiddens": [32]}, ) obs_shape = env.observation_space.shape @@ -81,22 +76,18 @@ def test_get_set_state(self): env = gym.make("CartPole-v1") module = DiscreteBCTFModule( - config=RLModuleConfig( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, - ) + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"fcnet_hiddens": [32]}, ) state = module.get_state() self.assertIsInstance(state, dict) module2 = DiscreteBCTFModule( - config=RLModuleConfig( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, - ) + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"fcnet_hiddens": [32]}, ) state2 = module2.get_state() check(state["policy"][0], state2["policy"][0], false=True) diff --git a/rllib/env/policy_server_input.py b/rllib/env/policy_server_input.py index c2e5e75aebf5..eedbe224e631 100644 --- a/rllib/env/policy_server_input.py +++ b/rllib/env/policy_server_input.py @@ -49,6 +49,10 @@ class PolicyServerInput(ThreadingMixIn, HTTPServer, InputReader): addr, port = ... 
config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("CartPole-v1") .offline_data( input_=lambda ioctx: PolicyServerInput(ioctx, addr, port) diff --git a/rllib/env/tests/test_multi_agent_env.py b/rllib/env/tests/test_multi_agent_env.py index 98caf12c57fa..31d4c9ea13cc 100644 --- a/rllib/env/tests/test_multi_agent_env.py +++ b/rllib/env/tests/test_multi_agent_env.py @@ -678,6 +678,10 @@ def test_multi_agent_with_flex_agents(self): register_env("flex_agents_multi_agent", lambda _: FlexAgentsMultiAgent()) config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("flex_agents_multi_agent") .env_runners(num_env_runners=0) .training(train_batch_size=50, minibatch_size=50, num_epochs=1) @@ -700,6 +704,10 @@ def test_multi_agent_with_sometimes_zero_agents_observing(self): ) config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("sometimes_zero_agents") .env_runners(num_env_runners=0) ) diff --git a/rllib/env/tests/test_multi_agent_env_runner.py b/rllib/env/tests/test_multi_agent_env_runner.py index 26136fb37aca..acfaa647bd70 100644 --- a/rllib/env/tests/test_multi_agent_env_runner.py +++ b/rllib/env/tests/test_multi_agent_env_runner.py @@ -94,12 +94,7 @@ def test_sample_episodes(self): def _build_config(self): # Build the configuration and use `PPO`. config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .environment( + PPOConfig().environment( MultiAgentCartPole, env_config={"num_agents": 2}, ) diff --git a/rllib/evaluation/tests/test_env_runner_v2.py b/rllib/evaluation/tests/test_env_runner_v2.py index d5d139f385a7..05f05c495961 100644 --- a/rllib/evaluation/tests/test_env_runner_v2.py +++ b/rllib/evaluation/tests/test_env_runner_v2.py @@ -52,6 +52,10 @@ def tearDownClass(cls): def test_sample_batch_rollout_single_agent_env(self): config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment(DebugCounterEnv) .framework("torch") .training( @@ -77,6 +81,10 @@ def test_sample_batch_rollout_single_agent_env(self): def test_sample_batch_rollout_multi_agent_env(self): config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("basic_multiagent") .framework("torch") .training( @@ -144,6 +152,10 @@ def compute_actions( config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .framework("torch") .environment("env_under_test") .env_runners( @@ -205,6 +217,10 @@ def __init__(self, *args, **kwargs): config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("basic_multiagent") .framework("torch") .training( @@ -274,6 +290,10 @@ def on_create_policy(self, *, policy_id, policy) -> None: config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("basic_multiagent") .framework("torch") .training( @@ -298,6 +318,10 @@ def on_create_policy(self, *, policy_id, policy) -> None: def test_start_episode(self): config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) 
.environment("basic_multiagent") .framework("torch") .training( @@ -352,6 +376,10 @@ def test_env_runner_output(self): # Test if we can produce RolloutMetrics just by stepping config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("basic_multiagent") .framework("torch") .training( @@ -409,6 +437,10 @@ def on_episode_end( # Test if we can produce RolloutMetrics just by stepping config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("basic_multiagent") .framework("torch") .training( diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index d52529d1e632..f371ba942d5f 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -172,6 +172,10 @@ def test_batch_ids(self): def test_global_vars_update(self): config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("CartPole-v1") .env_runners(num_envs_per_env_runner=1) # lr = 0.1 - [(0.1 - 0.000001) / 100000] * ts @@ -202,6 +206,10 @@ def test_query_evaluators(self): register_env("test", lambda _: gym.make("CartPole-v1")) config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment("test") .env_runners( num_env_runners=2, diff --git a/rllib/examples/_docs/rllib_on_rllib_readme.py b/rllib/examples/_docs/rllib_on_rllib_readme.py index 5e1090153dfc..4463eba4ce85 100644 --- a/rllib/examples/_docs/rllib_on_rllib_readme.py +++ b/rllib/examples/_docs/rllib_on_rllib_readme.py @@ -1,4 +1,7 @@ import gymnasium as gym +import numpy as np +import torch + from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, @@ -21,7 +24,7 @@ class ParrotEnv(gym.Env): def __init__(self, config): # Make the space (for actions and observations) configurable. self.action_space = config.get( - "parrot_shriek_range", gym.spaces.Box(-1.0, 1.0, shape=(1,)) + "parrot_shriek_range", gym.spaces.Box(-1.0, 1.0, (1,), np.float32) ) # Since actions should repeat observations, their spaces must be the # same. @@ -45,12 +48,12 @@ def step(self, action): """ # Set `done` and `truncated` flags after 10 steps. self.episode_len += 1 - done = truncated = self.episode_len >= 10 + terminated = truncated = self.episode_len >= 10 # r = -abs(obs - action) reward = -sum(abs(self.cur_obs - action)) # Set a new observation (random sample). self.cur_obs = self.observation_space.sample() - return self.cur_obs, reward, done, truncated, {} + return self.cur_obs, reward, terminated, truncated, {} # Create an RLlib Algorithm instance from a PPOConfig to learn how to @@ -88,7 +91,10 @@ def step(self, action): while not done: # Compute a single action, given the current observation # from the environment. - action = algo.compute_single_action(obs) + model_outputs = algo.env_runner.module.forward_inference( + {"obs": torch.from_numpy(obs)} + ) + action = model_outputs["action_dist_inputs"][0].numpy() # Apply the computed action in the environment. obs, reward, done, truncated, info = env.step(action) # Sum up rewards for reporting purposes. 
diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index 5dfac509e580..241b6a19429d 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -148,7 +148,10 @@ def get_cli_args(): get_trainable_cls(args.run) .get_default_config() # Batch-norm models have not been migrated to the RL Module API yet. - .api_stack(enable_rl_module_and_learner=False) + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment(AutoRegressiveActionEnv) .framework(args.framework) .training(gamma=0.5) diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index 93d85bcd7633..7b76fb227f2c 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -44,10 +44,6 @@ def _get_encoder_config( # Create a generic config with our enhanced Catalog ppo_config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .rl_module(rl_module_spec=RLModuleSpec(catalog_class=MobileNetEnhancedPPOCatalog)) .env_runners(num_env_runners=0) # The following training settings make it so that a training iteration is very diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 0cbe110810cf..01b274e92477 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -269,6 +269,10 @@ def get_default_policy_class(cls, config): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment(TwoStepGame) .framework(args.framework) .env_runners(batch_mode="complete_episodes", num_env_runners=0) diff --git a/rllib/examples/checkpoints/cartpole_dqn_export.py b/rllib/examples/checkpoints/cartpole_dqn_export.py index 48e73f15b6ae..86a623d012d9 100644 --- a/rllib/examples/checkpoints/cartpole_dqn_export.py +++ b/rllib/examples/checkpoints/cartpole_dqn_export.py @@ -18,6 +18,10 @@ def train_and_export_policy_and_model(algo_name, num_steps, model_dir, ckpt_dir): cls = get_trainable_cls(algo_name) config = cls.get_default_config() + config.api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) # This Example is only for tf. config.framework("tf") # Set exporting native (DL-framework) model files to True. diff --git a/rllib/examples/checkpoints/onnx_tf.py b/rllib/examples/checkpoints/onnx_tf.py index 65d83fb095c3..19fb7f376032 100644 --- a/rllib/examples/checkpoints/onnx_tf.py +++ b/rllib/examples/checkpoints/onnx_tf.py @@ -23,7 +23,15 @@ args = parser.parse_args() # Configure our PPO Algorithm. - config = ppo.PPOConfig().env_runners(num_env_runners=1).framework(args.framework) + config = ( + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .env_runners(num_env_runners=1) + .framework(args.framework) + ) outdir = "export_tf" if os.path.exists(outdir): diff --git a/rllib/examples/checkpoints/onnx_torch.py b/rllib/examples/checkpoints/onnx_torch.py index f718fffb7c8a..b7d39cc9225a 100644 --- a/rllib/examples/checkpoints/onnx_torch.py +++ b/rllib/examples/checkpoints/onnx_torch.py @@ -11,7 +11,15 @@ if __name__ == "__main__": # Configure our PPO Algorithm. 
- config = ppo.PPOConfig().env_runners(num_env_runners=1).framework("torch") + config = ( + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .env_runners(num_env_runners=1) + .framework("torch") + ) outdir = "export_torch" if os.path.exists(outdir): diff --git a/rllib/examples/custom_recurrent_rnn_tokenizer.py b/rllib/examples/custom_recurrent_rnn_tokenizer.py deleted file mode 100644 index fd7bab9edab5..000000000000 --- a/rllib/examples/custom_recurrent_rnn_tokenizer.py +++ /dev/null @@ -1,188 +0,0 @@ -# @OldAPIStack - -"""Example of defining custom tokenizers for recurrent models in RLModules. - -This example shows the following steps: -- Define a custom tokenizer for a recurrent encoder. -- Define a model config that builds the custom tokenizer. -- Modify the default PPOCatalog to use the custom tokenizer config. -- Run a training that uses the custom tokenizer. -""" - -import argparse -import os - -import ray -from ray import air, tune -from ray.air.constants import TRAINING_ITERATION -from ray.tune.registry import register_env -from ray.rllib.examples.envs.classes.repeat_after_me_env import RepeatAfterMeEnv -from ray.rllib.examples.envs.classes.repeat_initial_obs_env import RepeatInitialObsEnv -from ray.rllib.core.rl_module.rl_module import RLModuleSpec -from ray.rllib.policy.sample_batch import SampleBatch -from dataclasses import dataclass -from ray.rllib.core.models.base import Encoder, ENCODER_OUT -from ray.rllib.core.models.torch.base import TorchModel -from ray.rllib.core.models.tf.base import TfModel -from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.rllib.core.models.configs import ModelConfig - -parser = argparse.ArgumentParser() - -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() - -parser.add_argument("--env", type=str, default="RepeatAfterMeEnv") -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=100, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=90.0, help="Reward at which we stop training." -) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) - -# We first define a custom tokenizer that we want to use to encode the -# observations before they are passed into the recurrent cells. -# We do this step for tf and for torch here to make the following steps framework- -# agnostic. However, if you use only one framework, you can skip the other one. 
- - -class CustomTorchTokenizer(TorchModel, Encoder): - def __init__(self, config) -> None: - TorchModel.__init__(self, config) - Encoder.__init__(self, config) - self.net = nn.Sequential( - nn.Linear(config.input_dims[0], config.output_dims[0]), - ) - - def _forward(self, inputs: dict, **kwargs): - return {ENCODER_OUT: self.net(inputs[SampleBatch.OBS])} - - -class CustomTfTokenizer(TfModel, Encoder): - def __init__(self, config) -> None: - TfModel.__init__(self, config) - Encoder.__init__(self, config) - - self.net = tf.keras.models.Sequential( - [ - tf.keras.layers.Input(shape=config.input_dims), - tf.keras.layers.Dense(config.output_dims[0], activation="relu"), - ] - ) - - def _forward(self, inputs: dict, **kwargs): - return {ENCODER_OUT: self.net(inputs[SampleBatch.OBS])} - - -# Since RLlib decides during runtime which framework we use, we need to provide a -# model config that is buildable depending on the framework. The recurrent models -# will consume this config during runtime and build our custom tokenizer accordingly. - - -@dataclass -class CustomTokenizerConfig(ModelConfig): - output_dims: tuple = None - - def build(self, framework): - if framework == "torch": - return CustomTorchTokenizer(self) - else: - return CustomTfTokenizer(self) - - -# We now modify the default Catalog for PPO to inject our config. -# Alternatively, we could inherit from the PPO RLModule here, which is more -# straightforward if we want to completely replace -# the default models. However, we want to keep RLlib's default LSTM Encoder and only -# place our tokenizer inside of it, so we use the Catalog here for demonstration -# purposes. - - -class CustomPPOCatalog(PPOCatalog): - # Note that RLlib expects this to be a classmethod. - @classmethod - def get_tokenizer_config( - cls, - observation_space, - model_config_dict, - view_requirements=None, - ) -> ModelConfig: - return CustomTokenizerConfig( - input_dims=observation_space.shape, - output_dims=(64,), - ) - - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) - register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c)) - register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv()) - - config = ( - PPOConfig() - .environment(args.env, env_config={"repeat_delay": 2}) - .framework(args.framework) - .env_runners(num_env_runners=0, num_envs_per_env_runner=20) - .training( - model={ - "vf_share_layers": False, - "use_lstm": True, - "lstm_cell_size": 256, - "fcnet_hiddens": [256], - }, - gamma=0.9, - entropy_coeff=0.001, - vf_loss_coeff=1e-5, - ) - .rl_module(rl_module_spec=RLModuleSpec(catalog_class=CustomPPOCatalog)) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - TRAINING_ITERATION: args.stop_iters, - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, - } - - tuner = tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=1), - ) - results = tuner.fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index 5ef6ee1a0167..9e7a8960c56e 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -36,6 +36,10 @@ config = ( get_trainable_cls(args.run) .get_default_config() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .environment( CartPoleWithRemoteParamServer, env_config={"param_server": "param-server"}, diff --git a/rllib/examples/hierarchical/hierarchical_training.py b/rllib/examples/hierarchical/hierarchical_training.py index 924aa5de2f07..ccdc067fe3ae 100644 --- a/rllib/examples/hierarchical/hierarchical_training.py +++ b/rllib/examples/hierarchical/hierarchical_training.py @@ -91,6 +91,10 @@ run_config=air.RunConfig(stop=stop), param_space=( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment(WindyMazeEnv) .env_runners(num_env_runners=0) .framework(args.framework) @@ -107,6 +111,10 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment(HierarchicalWindyMazeEnv) .framework(args.framework) .env_runners(num_env_runners=0) diff --git a/rllib/examples/inference/policy_inference_after_training.py b/rllib/examples/inference/policy_inference_after_training.py index b462263a5dab..4ece833c3c53 100644 --- a/rllib/examples/inference/policy_inference_after_training.py +++ b/rllib/examples/inference/policy_inference_after_training.py @@ -138,7 +138,7 @@ ) # Create new RLModule and restore its state from the last algo checkpoint. # Note that the checkpoint for the RLModule can be found deeper inside the algo - # checkpoint's sub-directories ([algo dir] -> "learner/" -> "module_state/" -> + # checkpoint's subdirectories ([algo dir] -> "learner/" -> "module_state/" -> # "[module ID]): rl_module = RLModule.from_checkpoint( os.path.join( diff --git a/rllib/examples/inference/policy_inference_after_training_with_attention.py b/rllib/examples/inference/policy_inference_after_training_with_attention.py index 07779e3bc0b2..1e594066d18f 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_attention.py +++ b/rllib/examples/inference/policy_inference_after_training_with_attention.py @@ -84,6 +84,10 @@ config = ( get_trainable_cls(args.run) .get_default_config() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("FrozenLake-v1") # Run with tracing enabled for tf2? 
.framework(args.framework) diff --git a/rllib/examples/inference/policy_inference_after_training_with_lstm.py b/rllib/examples/inference/policy_inference_after_training_with_lstm.py index a7dc5ada6f3c..39c6ac6aa588 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_lstm.py +++ b/rllib/examples/inference/policy_inference_after_training_with_lstm.py @@ -82,6 +82,10 @@ config = ( get_trainable_cls(args.run) .get_default_config() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("FrozenLake-v1") # Run with tracing enabled for tf2? .framework(args.framework) diff --git a/rllib/examples/learners/custom_loss_fn_simple.py b/rllib/examples/learners/custom_loss_fn_simple.py index 9877fa10cddf..aa50db615977 100644 --- a/rllib/examples/learners/custom_loss_fn_simple.py +++ b/rllib/examples/learners/custom_loss_fn_simple.py @@ -112,10 +112,6 @@ class for details on how to override the main (PPO) loss function. base_config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .training( # This is the most important setting in this script: We point our PPO diff --git a/rllib/examples/metrics/custom_metrics_in_env_runners.py b/rllib/examples/metrics/custom_metrics_in_env_runners.py index cba86a50afb6..6c69bdc5746e 100644 --- a/rllib/examples/metrics/custom_metrics_in_env_runners.py +++ b/rllib/examples/metrics/custom_metrics_in_env_runners.py @@ -2,8 +2,8 @@ We use the `MetricsLogger` class, which RLlib provides inside all its components (only when using the new API stack through -`config.api_stack(_enable_rl_module_and_learner=True, -_enable_env_runner_and_connector_v2=True)`), +`config.api_stack(enable_rl_module_and_learner=True, +enable_env_runner_and_connector_v2=True)`), and which offers a unified API to log individual values per iteration, per episode timestep, per episode (as a whole), per loss call, etc.. `MetricsLogger` objects are available in all custom API code, for example inside your diff --git a/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py b/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py index 348dfb2af142..979d47562cca 100644 --- a/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py +++ b/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py @@ -264,10 +264,6 @@ def compute_values(self, batch, embeddings=None): # Create a new PPO config. base_config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment(args.env) .training( # Keep lr relatively low at the beginning to avoid catastrophic forgetting. diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 779c5c1fd041..66ce75c11eb6 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -153,15 +153,7 @@ def my_experiment(config: Dict): if __name__ == "__main__": - base_config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .environment("CartPole-v1") - .env_runners(num_env_runners=0) - ) + base_config = PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0) # Convert to a plain dict for Tune. Note that this is usually not needed, you can # pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object. 
# However, for demonstration purposes, we show here how you can add other, arbitrary diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index 5aedacc512a5..9823e47daaec 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -87,12 +87,7 @@ def flush(self): if __name__ == "__main__": config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .environment("CartPole-v1") + PPOConfig().environment("CartPole-v1") # Setting up a custom logger config. # ---------------------------------- # The following are different examples of custom logging setups: diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py index 5aee72218ddf..092b0710db57 100644 --- a/rllib/examples/ray_tune/custom_progress_reporter.py +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -94,10 +94,6 @@ config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("env") .multi_agent( # Define 3 policies. Note that in our simple setup, they are all configured diff --git a/rllib/examples/rl_modules/action_masking_rl_module.py b/rllib/examples/rl_modules/action_masking_rl_module.py index 0fc0492a6843..a4ac85c26ac8 100644 --- a/rllib/examples/rl_modules/action_masking_rl_module.py +++ b/rllib/examples/rl_modules/action_masking_rl_module.py @@ -92,11 +92,6 @@ base_config = ( PPOConfig() - .api_stack( - # This example runs only under the new pai stack. - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment( env=ActionMaskEnv, env_config={ diff --git a/rllib/examples/rl_modules/autoregressive_actions_rl_module.py b/rllib/examples/rl_modules/autoregressive_actions_rl_module.py index 2f7e576a5bd5..af1e27146582 100644 --- a/rllib/examples/rl_modules/autoregressive_actions_rl_module.py +++ b/rllib/examples/rl_modules/autoregressive_actions_rl_module.py @@ -93,10 +93,6 @@ .env_runners( num_env_runners=0, ) - .api_stack( - enable_env_runner_and_connector_v2=True, - enable_rl_module_and_learner=True, - ) .evaluation( evaluation_num_env_runners=1, evaluation_interval=1, diff --git a/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/rllib/examples/rl_modules/classes/mobilenet_rlm.py index 206bdda36bd8..8f3a86e69235 100644 --- a/rllib/examples/rl_modules/classes/mobilenet_rlm.py +++ b/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -53,10 +53,6 @@ def setup(self): config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .rl_module(rl_module_spec=RLModuleSpec(module_class=MobileNetTorchPPORLModule)) .environment( RandomEnv, diff --git a/rllib/examples/rl_modules/custom_lstm_rl_module.py b/rllib/examples/rl_modules/custom_lstm_rl_module.py index 88a98cd5476e..85b160808bd7 100644 --- a/rllib/examples/rl_modules/custom_lstm_rl_module.py +++ b/rllib/examples/rl_modules/custom_lstm_rl_module.py @@ -63,10 +63,6 @@ if __name__ == "__main__": args = parser.parse_args() - assert ( - args.enable_new_api_stack - ), "Must set --enable-new-api-stack when running this script!" 
- if args.num_agents == 0: register_env("env", lambda cfg: StatelessCartPole()) else: diff --git a/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py b/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py index 363ea610db67..21b68184051f 100644 --- a/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py +++ b/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py @@ -12,6 +12,10 @@ # Configure an old stack default ModelV2. config_old_stack = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .training( lr=0.0003, diff --git a/rllib/execution/rollout_ops.py b/rllib/execution/rollout_ops.py index 255d0ba4ba71..7c3fd489802f 100644 --- a/rllib/execution/rollout_ops.py +++ b/rllib/execution/rollout_ops.py @@ -63,12 +63,18 @@ def synchronous_parallel_sample( # Define an RLlib Algorithm. from ray.rllib.algorithms.ppo import PPO, PPOConfig - config = PPOConfig().environment("CartPole-v1") - algorithm = PPO(config=config) - # 2 remote workers (num_workers=2): - batches = synchronous_parallel_sample(worker_set=algorithm.env_runner_group, - concat=False) - print(len(batches)) + config = ( + PPOConfig() + .environment("CartPole-v1") + ) + algorithm = config.build() + # 2 remote EnvRunners (num_env_runners=2): + episodes = synchronous_parallel_sample( + worker_set=algorithm.env_runner_group, + _uses_new_env_runners=True, + concat=False, + ) + print(len(episodes)) .. testoutput:: diff --git a/rllib/models/tests/test_attention_nets.py b/rllib/models/tests/test_attention_nets.py index bed5ad726fbc..e105955908da 100644 --- a/rllib/models/tests/test_attention_nets.py +++ b/rllib/models/tests/test_attention_nets.py @@ -18,6 +18,7 @@ class TestAttentionNets(unittest.TestCase): config = { + "_enable_new_api_stack": False, "env": StatelessCartPole, "gamma": 0.99, "num_envs_per_env_runner": 20, @@ -40,6 +41,7 @@ def tearDownClass(cls) -> None: def test_attention_nets_w_prev_actions_and_prev_rewards(self): """Tests attention prev-a/r input insertions using complex actions.""" config = { + "_enable_new_api_stack": False, "env": RandomEnv, "env_config": { "config": { @@ -110,38 +112,6 @@ def test_ppo_attention_net_learning(self): run_config=air.RunConfig(stop=self.stop, verbose=1), ).fit() - # TODO: (sven) causes memory failures/timeouts on Travis. - # Re-enable this once we have fast attention in master branch. 
- def test_impala_attention_net_learning(self): - return - # ModelCatalog.register_custom_model("attention_net", GTrXLNet) - # config = dict( - # self.config, **{ - # "num_env_runners": 4, - # "num_gpus": 0, - # "entropy_coeff": 0.01, - # "vf_loss_coeff": 0.001, - # "lr": 0.0008, - # "model": { - # "custom_model": "attention_net", - # "max_seq_len": 65, - # "custom_model_config": { - # "num_transformer_units": 1, - # "attention_dim": 64, - # "num_heads": 1, - # "memory_inference": 10, - # "memory_training": 10, - # "head_dim": 32, - # "position_wise_mlp_dim": 32, - # }, - # }, - # }) - # tune.Tuner( - # "IMPALA", - # param_space=config, - # run_config=air.RunConfig(stop=self.stop, verbose=1), - # ).fit() - if __name__ == "__main__": import pytest diff --git a/rllib/models/tests/test_models.py b/rllib/models/tests/test_models.py index 86100e86b690..8ba77be666e1 100644 --- a/rllib/models/tests/test_models.py +++ b/rllib/models/tests/test_models.py @@ -62,6 +62,10 @@ def test_tf_modelv2(self): def test_modelv3(self): config = ( ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .framework("tf") .env_runners(num_env_runners=0) diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index 03a344de3289..f4451f15f11a 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -38,6 +38,10 @@ def tearDownClass(cls) -> None: def test_preprocessing_disabled_modelv2(self): config = ( ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment( "ray.rllib.examples.envs.classes.random_env.RandomEnv", env_config={ diff --git a/rllib/offline/tests/test_offline_env_runner.py b/rllib/offline/tests/test_offline_env_runner.py index 08c0ee5fa5c7..41dbb016b092 100644 --- a/rllib/offline/tests/test_offline_env_runner.py +++ b/rllib/offline/tests/test_offline_env_runner.py @@ -18,11 +18,6 @@ def setUp(self) -> None: self.base_path = pathlib.Path("/tmp/") self.config = ( PPOConfig() - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( # This defines how many rows per file we will # have (given `num_rows_per_file` in the diff --git a/rllib/offline/tests/test_offline_prelearner.py b/rllib/offline/tests/test_offline_prelearner.py index 503c35b0dfd1..919a9c12343b 100644 --- a/rllib/offline/tests/test_offline_prelearner.py +++ b/rllib/offline/tests/test_offline_prelearner.py @@ -238,10 +238,6 @@ def test_offline_prelearner_sample_from_episode_data(self): .env_runners( batch_mode="complete_episodes", ) - .api_stack( - enable_env_runner_and_connector_v2=True, - enable_rl_module_and_learner=True, - ) .offline_data( output=data_path, output_write_episodes=True, diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index 3aa8f19b776d..c1024dd80b17 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -129,20 +129,30 @@ def tearDownClass(cls) -> None: def test_ppo_cont(self): """Tests PPO's (cont. 
actions) compute_log_likelihoods method.""" - config = ppo.PPOConfig() - config.training( - model={ - "fcnet_hiddens": [10], - "fcnet_activation": "linear", - } + config = ( + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .training( + model={ + "fcnet_hiddens": [10], + "fcnet_activation": "linear", + } + ) + .debugging(seed=42) ) - config.debugging(seed=42) prev_a = np.array([0.0]) do_test_log_likelihood(ppo.PPO, config, prev_a, continuous=True) def test_ppo_discr(self): """Tests PPO's (discr. actions) compute_log_likelihoods method.""" config = ppo.PPOConfig() + config.api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) config.debugging(seed=42) prev_a = np.array(0) do_test_log_likelihood(ppo.PPO, config, prev_a) diff --git a/rllib/policy/tests/test_export_checkpoint_and_model.py b/rllib/policy/tests/test_export_checkpoint_and_model.py index 3515525ef1d3..2df1ff7defe5 100644 --- a/rllib/policy/tests/test_export_checkpoint_and_model.py +++ b/rllib/policy/tests/test_export_checkpoint_and_model.py @@ -21,6 +21,10 @@ def export_test( ): cls = get_trainable_cls(alg_name) config = cls.get_default_config() + config.api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) config.framework(framework) # Switch on saving native DL-framework (tf, torch) model files. config.checkpointing(export_native_model_files=True) diff --git a/rllib/policy/tests/test_policy_checkpoint_restore.py b/rllib/policy/tests/test_policy_checkpoint_restore.py index 87ff462e7787..cb7f15c9918b 100644 --- a/rllib/policy/tests/test_policy_checkpoint_restore.py +++ b/rllib/policy/tests/test_policy_checkpoint_restore.py @@ -16,6 +16,10 @@ def _do_checkpoint_twice_test(framework): # Checks if we can load a policy from a checkpoint (at least) twice config = ( PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) .env_runners(num_env_runners=0) .evaluation(evaluation_num_env_runners=0) ) @@ -94,10 +98,12 @@ def test_restore_checkpoint_with_nested_obs_space(self): space.original_space = gym.spaces.Discrete(2) space = space.original_space - # TODO(Artur): Construct a PPO policy here without the algorithm once we are - # able to do that with RLModules. policy = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment( observation_space=obs_space, action_space=gym.spaces.Discrete(2) ) diff --git a/rllib/policy/tests/test_policy_map.py b/rllib/policy/tests/test_policy_map.py index 94471a393c61..0a8911b895a5 100644 --- a/rllib/policy/tests/test_policy_map.py +++ b/rllib/policy/tests/test_policy_map.py @@ -23,7 +23,14 @@ def test_policy_map(self): # This is testing policy map which is something that will be deprecated in # favor of MultiAgentRLModules in the future. So we'll disable the RLModule API # for this test for now. 
- config = PPOConfig().framework("tf2") + config = ( + PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .framework("tf2") + ) obs_space = gym.spaces.Box(-1.0, 1.0, (4,), dtype=np.float32) dummy_obs = obs_space.sample() act_space = gym.spaces.Discrete(10000) diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 5db2d0a8262d..8f1675bab881 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -1,20 +1,6 @@ #!/usr/bin/env python -# Runs one or more regression tests. Retries tests up to 3 times. -# -# Example usage: -# $ python run_regression_tests.py regression-tests/cartpole-es-[tf|torch].yaml -# -# When using in BAZEL (with py_test), e.g. see in ray/rllib/BUILD: -# py_test( -# name = "run_regression_tests", -# main = "tests/run_regression_tests.py", -# tags = ["learning_tests"], -# size = "medium", # 5min timeout -# srcs = ["tests/run_regression_tests.py"], -# data = glob(["tuned_examples/regression_tests/*.yaml"]), -# # Pass `BAZEL` option and the path to look for yaml regression files. -# args = ["BAZEL", "tuned_examples/regression_tests"] -# ) + +# @OldAPIStack import argparse import os @@ -104,15 +90,6 @@ default=None, help="The WandB run name to use.", ) -# parser.add_argument( -# "--wandb-from-checkpoint", -# type=str, -# default=None, -# help=( -# "The WandB checkpoint location (e.g. `[team name]/[project name]/checkpoint_" -# "[run name]:v[version]`) from which to resume an experiment." -# ), -# ) parser.add_argument( "--checkpoint-freq", type=int, diff --git a/rllib/tests/test_dependency_torch.py b/rllib/tests/test_dependency_torch.py index 7048d0f92cf2..bcd720a6c7aa 100755 --- a/rllib/tests/test_dependency_torch.py +++ b/rllib/tests/test_dependency_torch.py @@ -21,6 +21,10 @@ # Note: No ray.init(), to test it works without Ray config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .framework("tf") .env_runners(num_env_runners=0) diff --git a/rllib/tests/test_gpus.py b/rllib/tests/test_gpus.py index 54ef39821f23..24511e14367a 100644 --- a/rllib/tests/test_gpus.py +++ b/rllib/tests/test_gpus.py @@ -18,7 +18,15 @@ def test_gpus_in_non_local_mode(self): actual_gpus = torch.cuda.device_count() print(f"Actual GPUs found (by torch): {actual_gpus}") - config = PPOConfig().env_runners(num_env_runners=2).environment("CartPole-v1") + config = ( + PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .env_runners(num_env_runners=2) + .environment("CartPole-v1") + ) # Expect errors when we run a config w/ num_gpus>0 w/o a GPU # and _fake_gpus=False. @@ -82,7 +90,15 @@ def test_gpus_in_local_mode(self): actual_gpus_available = torch.cuda.device_count() - config = PPOConfig().env_runners(num_env_runners=2).environment("CartPole-v1") + config = ( + PPOConfig() + .api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) + .env_runners(num_env_runners=2) + .environment("CartPole-v1") + ) # Expect no errors in local mode. 
for num_gpus in [0, 0.1, 1, actual_gpus_available + 4]: diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py index 6e4b2298b5d7..be0c8aaafafb 100644 --- a/rllib/tests/test_io.py +++ b/rllib/tests/test_io.py @@ -57,6 +57,10 @@ def tearDown(self): def write_outputs(self, output, fw, output_config=None): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .framework(fw) .training(train_batch_size=250) @@ -98,6 +102,10 @@ def test_agent_output_infos(self): def test_agent_input_dir(self): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .evaluation(off_policy_estimation_methods={}) .training(train_batch_size=250) @@ -125,6 +133,10 @@ def test_split_by_episode(self): def test_agent_input_postprocessing_enabled(self): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .training(train_batch_size=250) .offline_data( @@ -169,6 +181,10 @@ def test_agent_input_postprocessing_enabled(self): def test_agent_input_eval_sampler(self): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .offline_data( postprocess_inputs=True, # adds back 'advantages' @@ -210,6 +226,10 @@ def input_creator(ioctx: IOContext) -> InputReader: config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .offline_data(input_=input_procedure) .evaluation(off_policy_estimation_methods={}) @@ -229,6 +249,10 @@ def test_multiple_output_workers(self): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("CartPole-v1") .env_runners(num_env_runners=2) .training(train_batch_size=500) diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index 969683d8ca38..9481205f9291 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -178,6 +178,10 @@ def test_minibatch_sequencing(self): register_env("counter", lambda _: DebugCounterEnv()) config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("counter") .framework("tf") .env_runners(num_env_runners=0, rollout_fragment_length=20) diff --git a/rllib/tests/test_nn_framework_import_errors.py b/rllib/tests/test_nn_framework_import_errors.py index 61c06816d09d..d117bf0f385d 100644 --- a/rllib/tests/test_nn_framework_import_errors.py +++ b/rllib/tests/test_nn_framework_import_errors.py @@ -9,7 +9,15 @@ def test_dont_import_torch_error(): """Check error being thrown, if torch not installed but configured.""" # Do not import tf for testing purposes. 
os.environ["RLLIB_TEST_NO_TORCH_IMPORT"] = "1" - config = ppo.PPOConfig().environment("CartPole-v1").framework("torch") + config = ( + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment("CartPole-v1") + .framework("torch") + ) with pytest.raises(ImportError, match="However, no installation was found"): config.build() diff --git a/rllib/tests/test_pettingzoo_env.py b/rllib/tests/test_pettingzoo_env.py index e77d42a89811..e42d18b77f5c 100644 --- a/rllib/tests/test_pettingzoo_env.py +++ b/rllib/tests/test_pettingzoo_env.py @@ -54,6 +54,10 @@ def env_creator(config): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("pistonball", env_config={"local_ratio": 0.5}) .multi_agent( # Set of policy IDs (by default, will use Algorithms's @@ -82,6 +86,10 @@ def test_pettingzoo_env(self): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .environment("simple_spread") .env_runners(num_env_runners=0, rollout_fragment_length=30) .debugging(log_level="DEBUG") diff --git a/rllib/tests/test_placement_groups.py b/rllib/tests/test_placement_groups.py index 2e056e09d1a9..2d268a9a4b10 100644 --- a/rllib/tests/test_placement_groups.py +++ b/rllib/tests/test_placement_groups.py @@ -35,6 +35,10 @@ def test_overriding_default_resource_request(self): # 3 Trials: Can only run 2 at a time (num_cpus=6; needed: 3). config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .training( model={"fcnet_hiddens": [10]}, lr=tune.grid_search([0.1, 0.01, 0.001]) ) @@ -71,6 +75,10 @@ def default_resource_request(cls, config): def test_default_resource_request(self): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .resources(placement_strategy="SPREAD") .env_runners( num_env_runners=2, @@ -98,6 +106,10 @@ def test_default_resource_request(self): def test_default_resource_request_plus_manual_leads_to_error(self): config = ( PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .training(model={"fcnet_hiddens": [10]}) .environment("CartPole-v1") .env_runners(num_env_runners=0) diff --git a/rllib/tests/test_ray_client.py b/rllib/tests/test_ray_client.py deleted file mode 100644 index cce060f8b69c..000000000000 --- a/rllib/tests/test_ray_client.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import unittest - -import ray -from ray import air, tune -from ray.air.constants import TRAINING_ITERATION -import ray.rllib.algorithms.ppo as ppo -from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole -from ray.util.client.ray_client_helpers import ray_start_client_server - - -class TestRayClient(unittest.TestCase): - def test_connection(self): - with ray_start_client_server(): - assert ray.util.client.ray.is_connected() - assert ray.util.client.ray.is_connected() is False - - def test_custom_experiment(self): - with ray_start_client_server(): - assert ray.util.client.ray.is_connected() - - config = { - # Special flag signalling `my_experiment` how many iters to do. - "train-iterations": 2, - "lr": 0.01, - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), - "num_env_runners": 0, - "framework": "tf", - } - resources = ppo.PPO.default_resource_request(config) - from ray.rllib.examples.ray_tune.custom_experiment import my_experiment - - tune.Tuner( - tune.with_resources(my_experiment, resources), - param_space=config, - ).fit() - - def test_cartpole_lstm(self): - with ray_start_client_server(): - assert ray.util.client.ray.is_connected() - - config = { - "env": StatelessCartPole, - } - - stop = {TRAINING_ITERATION: 3} - - tune.Tuner( - "PPO", - param_space=config, - run_config=air.RunConfig(stop=stop, verbose=2), - ).fit() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/tests/test_timesteps.py b/rllib/tests/test_timesteps.py index 6b95864d26aa..f0a081c57246 100644 --- a/rllib/tests/test_timesteps.py +++ b/rllib/tests/test_timesteps.py @@ -20,6 +20,10 @@ def test_timesteps(self): """Test whether PG can be built with both frameworks.""" config = ( ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) .experimental(_disable_preprocessor_api=True) .environment(RandomEnv) .env_runners(num_env_runners=0) diff --git a/rllib/train.py b/rllib/train.py index 5f9439b0138a..32f59ec39463 100755 --- a/rllib/train.py +++ b/rllib/train.py @@ -91,6 +91,10 @@ def load_experiments_from_file( experiments = yaml.safe_load(f) if stop is not None and stop != "{}": raise ValueError("`stop` criteria only supported for python files.") + # Make sure yaml experiments are always old API stack. + for experiment in experiments.values(): + experiment["config"]["enable_rl_module_and_learner"] = False + experiment["config"]["enable_env_runner_and_connector_v2"] = False # Python file case (ensured by file type enum) else: module_name = os.path.basename(config_file).replace(".py", "") diff --git a/rllib/tuned_examples/bc/cartpole_recording.py b/rllib/tuned_examples/bc/cartpole_recording.py index 4ce66aef5c47..e34b76a2c953 100644 --- a/rllib/tuned_examples/bc/cartpole_recording.py +++ b/rllib/tuned_examples/bc/cartpole_recording.py @@ -15,11 +15,6 @@ config = ( PPOConfig() - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( rollout_fragment_length=1000, num_env_runners=0, batch_mode="truncate_episodes" ) diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py index b79817b9e976..4cbe1fa6e204 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py @@ -302,11 +302,6 @@ def stop_all(self): }, clip_rewards=True, ) - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index e76bf40cdcdd..2e8deb84d354 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -295,11 +295,6 @@ def stop_all(self): }, clip_rewards=True, ) - # Enable new API stack and use EnvRunner. 
- .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, diff --git a/rllib/tuned_examples/dqn/cartpole_dqn.py b/rllib/tuned_examples/dqn/cartpole_dqn.py index 6b417a9c9782..821ff7c8d5bb 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn.py @@ -13,10 +13,6 @@ config = ( DQNConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment(env="CartPole-v1") .training( lr=0.0005 * (args.num_gpus or 1) ** 0.5, diff --git a/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py index 280822465c5f..726ec923cf99 100644 --- a/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py @@ -25,10 +25,6 @@ config = ( DQNConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment(env="multi_agent_cartpole", env_config={"num_agents": args.num_agents}) .training( lr=0.00065 * (args.num_gpus or 1) ** 0.5, diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index ba8be549c89c..e29a99ebc155 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -85,11 +85,6 @@ def stop_all(self): config = ( PPOConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( # Following the paper. num_env_runners=32, diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index 8116a2431cd5..c3a4c37c1437 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -69,11 +69,6 @@ config = ( PPOConfig() .environment(env=env) - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( rollout_fragment_length=1, num_env_runners=num_rollout_workers, diff --git a/rllib/tuned_examples/ppo/cartpole-ppo.yaml b/rllib/tuned_examples/ppo/cartpole-ppo.yaml deleted file mode 100644 index 94a093eec3b3..000000000000 --- a/rllib/tuned_examples/ppo/cartpole-ppo.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# @OldAPIStack -cartpole-ppo: - env: CartPole-v1 - run: PPO - stop: - env_runners/episode_return_mean: 150 - timesteps_total: 100000 - config: - # Works for both torch and tf2. - framework: torch - gamma: 0.99 - lr: 0.0003 - num_env_runners: 1 - num_epochs: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true diff --git a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py index 8d9405d7560f..523eaf0996f4 100644 --- a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py @@ -25,11 +25,6 @@ config = ( PPOConfig() - # Enable new API stack and use EnvRunner. 
- .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("cartpole_truncated") .env_runners(num_envs_per_env_runner=10) .training( @@ -39,7 +34,10 @@ ) # For evaluation, use the "real" CartPole-v1 env (up to 500 steps). .evaluation( - evaluation_config=PPOConfig.overrides(env="CartPole-v1"), + evaluation_config=PPOConfig.overrides( + env="CartPole-v1", + explore=False, + ), evaluation_interval=1, evaluation_num_env_runners=1, ) @@ -47,7 +45,7 @@ stop = { f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 500000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 200.0, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 80.0, } diff --git a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py index bd3794daf41d..deb56f84ca02 100644 --- a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py +++ b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py @@ -4,10 +4,6 @@ config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) # Switch off np.random, which is known to have memory leaks. .environment(RandomLargeObsSpaceEnv, env_config={"static_samples": True}) .env_runners( diff --git a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py index 7e4f74ea50a8..8130cdda1af9 100644 --- a/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_cartpole_ppo.py @@ -22,10 +22,6 @@ config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("multi_agent_cartpole", env_config={"num_agents": args.num_agents}) .rl_module( model_config=DefaultModelConfig( diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py index 9ad40c4c2b47..92e364b4343c 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo.py @@ -23,10 +23,6 @@ config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) .env_runners( env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True), diff --git a/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py b/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py index d700cb7ab0c8..3e63f299793e 100644 --- a/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/multi_agent_stateless_cartpole_ppo.py @@ -27,11 +27,6 @@ config = ( PPOConfig() - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("multi_stateless_cart") .env_runners( env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True), diff --git a/rllib/tuned_examples/ppo/pendulum-ppo.yaml b/rllib/tuned_examples/ppo/pendulum-ppo.yaml deleted file mode 100644 index 7ab57c621a97..000000000000 --- a/rllib/tuned_examples/ppo/pendulum-ppo.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# @OldAPIStack -# Can expect improvement to -140 reward in ~300-500k timesteps. -pendulum-ppo: - env: Pendulum-v1 - run: PPO - stop: - env_runners/episode_return_mean: -400 - timesteps_total: 400000 - config: - # Works for both torch and tf. 
- framework: torch - train_batch_size: 512 - vf_clip_param: 10.0 - num_env_runners: 0 - num_envs_per_env_runner: 20 - lambda: 0.1 - gamma: 0.95 - lr: 0.0003 - minibatch_size: 64 - observation_filter: MeanStdFilter - model: - fcnet_activation: relu diff --git a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml deleted file mode 100644 index 490b63245f15..000000000000 --- a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# @OldAPIStack -repeat-after-me-ppo-w-lstm: - # Default case: Discrete(2) observations/actions. - env: ray.rllib.examples.envs.classes.repeat_after_me_env.RepeatAfterMeEnv - run: PPO - stop: - env_runners/episode_return_mean: 50 - timesteps_total: 100000 - config: - # Works for both torch and tf. - framework: torch - # Make env partially observable. - env_config: - config: - repeat_delay: 2 - gamma: 0.9 - lr: 0.0003 - num_env_runners: 0 - num_envs_per_env_runner: 20 - num_epochs: 5 - entropy_coeff: 0.00001 - model: - use_lstm: true - lstm_cell_size: 64 - max_seq_len: 20 - fcnet_hiddens: [64] - vf_share_layers: true diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py index d461730641a0..2ec59d9f3af8 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py @@ -76,11 +76,6 @@ def stop_all(self): config = ( SACConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) - # Enable new API stack and use EnvRunner. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( rollout_fragment_length=1, num_env_runners=0, diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py index 66d4a1f46d5f..8ac4faf0b188 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py @@ -63,11 +63,6 @@ config = ( SACConfig() .environment(env=env) - # Enable new API stack and use EnvRunner. 
- .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .env_runners( rollout_fragment_length="auto", num_env_runners=1, diff --git a/rllib/tuned_examples/sac/halfcheetah_sac.py b/rllib/tuned_examples/sac/halfcheetah_sac.py index dd9d28c715c0..d763631035b8 100644 --- a/rllib/tuned_examples/sac/halfcheetah_sac.py +++ b/rllib/tuned_examples/sac/halfcheetah_sac.py @@ -16,10 +16,6 @@ config = ( SACConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("HalfCheetah-v4") .training( initial_alpha=1.001, diff --git a/rllib/tuned_examples/sac/humanoid_sac.py b/rllib/tuned_examples/sac/humanoid_sac.py index 8ecba7d4cfa0..8311ee6dc134 100644 --- a/rllib/tuned_examples/sac/humanoid_sac.py +++ b/rllib/tuned_examples/sac/humanoid_sac.py @@ -25,10 +25,6 @@ config = ( SACConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("Humanoid-v4") .training( initial_alpha=1.001, diff --git a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py index 481c61e3824b..2d2729b6c10c 100644 --- a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py +++ b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py @@ -27,10 +27,6 @@ config = ( SACConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("multi_agent_pendulum", env_config={"num_agents": args.num_agents}) .training( initial_alpha=1.001, diff --git a/rllib/tuned_examples/sac/pendulum_sac.py b/rllib/tuned_examples/sac/pendulum_sac.py index 16635e32c96a..466b7fc09413 100644 --- a/rllib/tuned_examples/sac/pendulum_sac.py +++ b/rllib/tuned_examples/sac/pendulum_sac.py @@ -15,10 +15,6 @@ config = ( SACConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("Pendulum-v1") .training( initial_alpha=1.001, diff --git a/rllib/utils/exploration/tests/test_curiosity.py b/rllib/utils/exploration/tests/test_curiosity.py index bcc603171264..adbab1cf4573 100644 --- a/rllib/utils/exploration/tests/test_curiosity.py +++ b/rllib/utils/exploration/tests/test_curiosity.py @@ -50,6 +50,10 @@ def test_curiosity_on_frozen_lake(self): config = ( ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) # A very large frozen-lake that's hard for a random policy to solve # due to 0.0 feedback. 
.environment( @@ -88,7 +92,8 @@ def test_curiosity_on_frozen_lake(self): "type": "StochasticSampling", }, }, - ).training(lr=0.001) + ) + .training(lr=0.001) ) num_iterations = 10 diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index e7e29002e8f0..c8a1c14c1932 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -87,7 +87,13 @@ def test_impala(self): def test_ppo_discr(self): config = ( - ppo.PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0) + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment("CartPole-v1") + .env_runners(num_env_runners=0) ) do_test_explorations( config, @@ -97,7 +103,13 @@ def test_ppo_discr(self): def test_ppo_cont(self): config = ( - ppo.PPOConfig().environment("Pendulum-v1").env_runners(num_env_runners=0) + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment("Pendulum-v1") + .env_runners(num_env_runners=0) ) do_test_explorations( config, diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index f9dd0e2edb1a..95724d911bfe 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1,6 +1,4 @@ import argparse -from collections import Counter -import copy import json import logging import os @@ -18,7 +16,6 @@ Type, Union, ) -import yaml import gymnasium as gym from gymnasium.spaces import Box, Discrete, MultiDiscrete, MultiBinary @@ -31,12 +28,9 @@ from ray import air, tune from ray.air.constants import TRAINING_ITERATION from ray.air.integrations.wandb import WandbLoggerCallback, WANDB_ENV_VAR -from ray.rllib.common import SupportedFileType from ray.rllib.core import DEFAULT_MODULE_ID, Columns from ray.rllib.env.wrappers.atari_wrappers import is_atari, wrap_deepmind -from ray.rllib.train import load_experiments_from_file from ray.rllib.utils.annotations import OldAPIStack -from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.framework import try_import_jax, try_import_tf, try_import_torch from ray.rllib.utils.metrics import ( DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY, @@ -44,15 +38,13 @@ EPISODE_RETURN_MEAN, EVALUATION_RESULTS, NUM_ENV_STEPS_TRAINED, - NUM_ENV_STEPS_TRAINED_LIFETIME, NUM_ENV_STEPS_SAMPLED_LIFETIME, - NUM_EPISODES_LIFETIME, ) from ray.rllib.utils.typing import ResultDict from ray.rllib.utils.error import UnsupportedSpaceException -from ray.tune import CLIReporter, run_experiments +from ray.tune import CLIReporter if TYPE_CHECKING: @@ -958,326 +950,6 @@ def check_train_results(train_results: ResultDict): return train_results -@Deprecated(new="run_learning_tests_from_yaml_or_py(config_files=...)", error=False) -def run_learning_tests_from_yaml( - yaml_files: List[str], - *, - framework: Optional[str] = None, - max_num_repeats: int = 2, - use_pass_criteria_as_stop: bool = True, - smoke_test: bool = False, -): - return run_learning_tests_from_yaml_or_py( - yaml_files, - framework=framework, - max_num_repeats=max_num_repeats, - use_pass_criteria_as_stop=use_pass_criteria_as_stop, - smoke_test=smoke_test, - ) - - -def run_learning_tests_from_yaml_or_py( - config_files: List[str], - *, - framework: Optional[str] = None, - max_num_repeats: int = 2, - use_pass_criteria_as_stop: bool = True, - smoke_test: bool = False, -) -> Dict[str, Any]: - """Runs the given experiments in config_files and returns 
results dict. - - Args: - framework: The framework to use for running this test. If None, - run the test on all frameworks. - config_files: List of yaml or py config file names. - max_num_repeats: How many times should we repeat a failed - experiment? - use_pass_criteria_as_stop: Configure the Trial so that it stops - as soon as pass criterias are met. - smoke_test: Whether this is just a smoke-test. If True, - set time_total_s to 5min and don't early out due to rewards - or timesteps reached. - - Returns: - A results dict mapping strings (e.g. "time_taken", "stats", "passed") to - the respective stats/values. - """ - print("Will run the following config files:") - for config_file in config_files: - print("->", config_file) - - # All trials we'll ever run in this test script. - all_trials = [] - # The experiments (by name) we'll run up to `max_num_repeats` times. - experiments = {} - # The results per experiment. - checks = {} - # Metrics per experiment. - stats = {} - - start_time = time.monotonic() - - def should_check_eval(experiment): - # If we have evaluation workers, use their rewards. - # This is useful for offline learning tests, where - # we evaluate against an actual environment. - return bool(experiment["config"].get("evaluation_interval")) - - # Loop through all collected files and gather experiments. - # Set correct framework(s). - for config_file in config_files: - # For python files, need to make sure, we only deliver the module name into the - # `load_experiments_from_file` function (everything from "/ray/rllib" on). - if config_file.endswith(".py"): - if config_file.endswith( - "__init__.py" - ): # weird CI learning test (BAZEL) case - continue - tf_experiments = load_experiments_from_file( - config_file, SupportedFileType.python - ) - else: - tf_experiments = load_experiments_from_file( - config_file, SupportedFileType.yaml - ) - - # Add torch version of all experiments to the list. - for k, e in tf_experiments.items(): - # If framework given as arg, use that framework. - if framework is not None: - frameworks = [framework] - # If framework given in config, only test for that framework. - # Some algos do not have both versions available. - elif "frameworks" in e: - frameworks = e["frameworks"] - else: - # By default we don't run tf2, because tf2's multi-gpu support - # isn't complete yet. - frameworks = ["tf", "torch"] - # Pop frameworks key to not confuse Tune. - e.pop("frameworks", None) - - e["stop"] = e["stop"] if "stop" in e else {} - e["pass_criteria"] = e["pass_criteria"] if "pass_criteria" in e else {} - - check_eval = should_check_eval(e) - episode_reward_key = ( - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" - if not check_eval - else f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" - ) - - # For smoke-tests, we just run for n min. - if smoke_test: - # 0sec for each(!) experiment/trial. - # This is such that if there are many experiments/trials - # in a test (e.g. rllib_learning_test), each one can at least - # create its Algorithm and run a first iteration. - e["stop"]["time_total_s"] = 0 - else: - if use_pass_criteria_as_stop: - # We also stop early, once we reach the desired reward. - min_reward = e.get("pass_criteria", {}).get(episode_reward_key) - if min_reward is not None: - e["stop"][episode_reward_key] = min_reward - - # Generate `checks` dict for all experiments - # (tf, tf2 and/or torch). 
- for framework in frameworks: - k_ = k + "-" + framework - ec = copy.deepcopy(e) - ec["config"]["framework"] = framework - if framework == "tf2": - ec["config"]["eager_tracing"] = True - - checks[k_] = { - "min_reward": ec["pass_criteria"].get(episode_reward_key, 0.0), - "min_throughput": ec["pass_criteria"].get("timesteps_total", 0.0) - / (ec["stop"].get("time_total_s", 1.0) or 1.0), - "time_total_s": ec["stop"].get("time_total_s"), - "failures": 0, - "passed": False, - } - # This key would break tune. - ec.pop("pass_criteria", None) - - # One experiment to run. - experiments[k_] = ec - - # Keep track of those experiments we still have to run. - # If an experiment passes, we'll remove it from this dict. - experiments_to_run = experiments.copy() - - # When running as a release test, use `/mnt/cluster_storage` as the storage path. - release_test_storage_path = "/mnt/cluster_storage" - if os.path.exists(release_test_storage_path): - for k, e in experiments_to_run.items(): - e["storage_path"] = release_test_storage_path - - try: - ray.init(address="auto") - except ConnectionError: - ray.init() - - for i in range(max_num_repeats): - # We are done. - if len(experiments_to_run) == 0: - print("All experiments finished.") - break - - print(f"Starting learning test iteration {i}...") - - # Print out the actual config. - print("== Test config ==") - print(yaml.dump(experiments_to_run)) - - # Run remaining experiments. - trials = run_experiments( - experiments_to_run, - resume=False, - verbose=2, - progress_reporter=CLIReporter( - metric_columns={ - TRAINING_ITERATION: "iter", - "time_total_s": "time_total_s", - NUM_ENV_STEPS_SAMPLED_LIFETIME: "ts (sampled)", - NUM_ENV_STEPS_TRAINED_LIFETIME: "ts (trained)", - NUM_EPISODES_LIFETIME: "train_episodes", - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": "reward_mean", - ( - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/" - f"{EPISODE_RETURN_MEAN}" - ): "eval_reward_mean", - }, - parameter_columns=["framework"], - sort_by_metric=True, - max_report_frequency=30, - ), - ) - - all_trials.extend(trials) - - # Check each experiment for whether it passed. - # Criteria is to a) reach reward AND b) to have reached the throughput - # defined by `NUM_ENV_STEPS_(SAMPLED|TRAINED)` / `time_total_s`. - for experiment in experiments_to_run.copy(): - print(f"Analyzing experiment {experiment} ...") - # Collect all trials within this experiment (some experiments may - # have num_samples or grid_searches defined). - trials_for_experiment = [] - for t in trials: - trial_exp = re.sub(".+/([^/]+)$", "\\1", t.local_dir) - if trial_exp == experiment: - trials_for_experiment.append(t) - print(f" ... Trials: {trials_for_experiment}.") - - check_eval = should_check_eval(experiments[experiment]) - - # Error: Increase failure count and repeat. - if any(t.status == "ERROR" for t in trials_for_experiment): - print(" ... ERROR.") - checks[experiment]["failures"] += 1 - # Smoke-tests always succeed. - elif smoke_test: - print(" ... SMOKE TEST (mark ok).") - checks[experiment]["passed"] = True - del experiments_to_run[experiment] - # Experiment finished: Check reward achieved and timesteps done - # (throughput). - else: - # Use best_result's reward to check min_reward. 
- if check_eval: - episode_return_mean = np.mean( - [ - t.metric_analysis[ - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/" - f"{EPISODE_RETURN_MEAN}" - ]["max"] - for t in trials_for_experiment - ] - ) - else: - episode_return_mean = np.mean( - [ - t.metric_analysis[ - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" - ]["max"] - for t in trials_for_experiment - ] - ) - desired_reward = checks[experiment]["min_reward"] - - # Use last_result["timesteps_total"] to check throughput. - timesteps_total = np.mean( - [t.last_result["timesteps_total"] for t in trials_for_experiment] - ) - total_time_s = np.mean( - [t.last_result["time_total_s"] for t in trials_for_experiment] - ) - - # TODO(jungong) : track training- and env throughput separately. - throughput = timesteps_total / (total_time_s or 1.0) - # Throughput verification is not working. Many algorithm, e.g. TD3, - # achieves the learning goal, but fails the throughput check - # miserably. - # TODO(jungong): Figure out why. - # - # desired_throughput = checks[experiment]["min_throughput"] - desired_throughput = None - - # Record performance. - stats[experiment] = { - "episode_reward_mean": float(episode_return_mean), - "throughput": ( - float(throughput) if throughput is not None else 0.0 - ), - } - - print( - f" ... Desired reward={desired_reward}; " - f"desired throughput={desired_throughput}" - ) - - # We failed to reach desired reward or the desired throughput. - if (desired_reward and episode_return_mean < desired_reward) or ( - desired_throughput and throughput < desired_throughput - ): - print( - " ... Not successful: Actual " - f"return={episode_return_mean}; " - f"actual throughput={throughput}" - ) - checks[experiment]["failures"] += 1 - # We succeeded! - else: - print( - " ... Successful: (mark ok). Actual " - f"return={episode_return_mean}; " - f"actual throughput={throughput}" - ) - checks[experiment]["passed"] = True - del experiments_to_run[experiment] - - ray.shutdown() - - time_taken = time.monotonic() - start_time - - # Create results dict and write it to disk. - result = { - "time_taken": float(time_taken), - "trial_states": dict(Counter([trial.status for trial in all_trials])), - "last_update": float(time.time()), - "stats": stats, - "passed": [k for k, exp in checks.items() if exp["passed"]], - "not_passed": [k for k, exp in checks.items() if not exp["passed"]], - "failures": { - k: exp["failures"] for k, exp in checks.items() if exp["failures"] > 0 - }, - } - - return result - - # TODO (sven): Make this the de-facto, well documented, and unified utility for most of # our tests: # - CI (label: "learning_tests")
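The hunks above follow one convention: tuned examples drop their explicit `.api_stack(enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True)` calls (the stack those calls enabled appears to be the default now), while tests and YAML-defined experiments that still depend on old-stack features switch back to the old API stack explicitly. Below is a minimal illustrative sketch of the two forms, restricted to the calls and config keys that already appear in this patch; the environment and runner settings are placeholders, not part of any specific test.

# Sketch only: summarizes the configuration pattern applied throughout this patch.
from ray.rllib.algorithms.ppo import PPOConfig

# 1) Builder form, as added to the updated test cases: explicitly opt back
#    into the old API stack before building the Algorithm.
config = (
    PPOConfig()
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    .environment("CartPole-v1")
    .env_runners(num_env_runners=0)
)
algo = config.build()

# 2) Dict form, as forced onto every YAML experiment in
#    `load_experiments_from_file()` in rllib/train.py.
experiment = {"config": {"env": "CartPole-v1"}}
experiment["config"]["enable_rl_module_and_learner"] = False
experiment["config"]["enable_env_runner_and_connector_v2"] = False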