diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ea970c265..b45ae3192 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,8 +32,6 @@ jobs: pip install .[extra,tests,docs] # Use headless version pip install opencv-python-headless - # Tmp fix: ROM missing in the newest atari-py version - pip install atari-py==0.2.5 - name: Build the doc run: | make doc diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 63f9eafa0..45ca8f56a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,8 +7,6 @@ type-check: pytest: script: - python --version - # Fix to get atari ROMs - - pip install atari-py==0.2.5 # MKL_THREADING_LAYER=GNU to avoid MKL_THREADING_LAYER=INTEL incompatibility error - MKL_THREADING_LAYER=GNU make pytest diff --git a/README.md b/README.md index 54577e452..2a0701c1a 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ Documentation is available online: [https://sb3-contrib.readthedocs.io/](https:/ **Note:** Stable-Baselines3 supports PyTorch >= 1.8.1. ### Prerequisites -Stable Baselines3 requires python 3.7+. +Stable Baselines3 requires Python 3.7+. #### Windows 10 diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst index 279664171..19bccb22c 100644 --- a/docs/guide/callbacks.rst +++ b/docs/guide/callbacks.rst @@ -174,7 +174,7 @@ and optionally a prefix for the checkpoints (``rl_model`` by default). checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', name_prefix='rl_model') - model = SAC('MlpPolicy', 'Pendulum-v0') + model = SAC('MlpPolicy', 'Pendulum-v1') model.learn(2000, callback=checkpoint_callback) @@ -206,13 +206,13 @@ It will save the best model if ``best_model_save_path`` folder is specified and from stable_baselines3.common.callbacks import EvalCallback # Separate evaluation env - eval_env = gym.make('Pendulum-v0') + eval_env = gym.make('Pendulum-v1') # Use deterministic actions for evaluation eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=500, deterministic=True, render=False) - model = SAC('MlpPolicy', 'Pendulum-v0') + model = SAC('MlpPolicy', 'Pendulum-v1') model.learn(5000, callback=eval_callback) @@ -234,13 +234,13 @@ Alternatively, you can pass directly a list of callbacks to the ``learn()`` meth checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/') # Separate evaluation env - eval_env = gym.make('Pendulum-v0') + eval_env = gym.make('Pendulum-v1') eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model', log_path='./logs/results', eval_freq=500) # Create the callback list callback = CallbackList([checkpoint_callback, eval_callback]) - model = SAC('MlpPolicy', 'Pendulum-v0') + model = SAC('MlpPolicy', 'Pendulum-v1') # Equivalent to: # model.learn(5000, callback=[checkpoint_callback, eval_callback]) model.learn(5000, callback=callback) @@ -263,12 +263,12 @@ It must be used with the :ref:`EvalCallback` and use the event triggered by a ne from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold # Separate evaluation env - eval_env = gym.make('Pendulum-v0') + eval_env = gym.make('Pendulum-v1') # Stop training when the model reaches the reward threshold callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1) eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1) - model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1) + model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1) # Almost infinite number of 
timesteps, but the training will stop # early as soon as the reward threshold is reached model.learn(int(1e10), callback=eval_callback) @@ -299,7 +299,7 @@ An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` t checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/') event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event) - model = PPO('MlpPolicy', 'Pendulum-v0', verbose=1) + model = PPO('MlpPolicy', 'Pendulum-v1', verbose=1) model.learn(int(2e4), callback=event_callback) @@ -328,7 +328,7 @@ and in total for ``max_episodes * n_envs`` episodes. # Stops training when the model reaches the maximum number of episodes callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1) - model = A2C('MlpPolicy', 'Pendulum-v0', verbose=1) + model = A2C('MlpPolicy', 'Pendulum-v1', verbose=1) # Almost infinite number of timesteps, but the training will stop # early as soon as the max number of episodes is reached model.learn(int(1e10), callback=callback_max_episodes) diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst index d17f913c5..1b8f9fb7f 100644 --- a/docs/guide/custom_policy.rst +++ b/docs/guide/custom_policy.rst @@ -407,5 +407,5 @@ you only need to specify ``net_arch=[256, 256]`` (here, two hidden layers of 256 # Custom critic architecture with two layers of 400 and 300 units policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[400, 300])) # Create the agent - model = SAC("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs, verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs, verbose=1) model.learn(5000) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 733279bc3..a5b56b249 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -321,7 +321,7 @@ Atari Games Training a RL agent on Atari games is straightforward thanks to ``make_atari_env`` helper function. It will do `all the preprocessing `_ -and multiprocessing for you. +and multiprocessing for you. To install the Atari environments and ROMs, run ``pip install gym[atari,accept-rom-license]``, or install Stable Baselines3 with ``pip install stable-baselines3[extra]``, which pulls in these and other optional dependencies. .. image:: ../_static/img/colab-badge.svg :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/atari_games.ipynb @@ -564,7 +564,7 @@ Behind the scene, SB3 uses an :ref:`EvalCallback `. # Create the model, the training environment # and the test environment (for evaluation) - model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1, + model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1, learning_rate=1e-3, create_eval_env=True) # Evaluate the model every 1000 steps on 5 test episodes diff --git a/docs/guide/export.rst b/docs/guide/export.rst index d6fe72a78..b6884c19d 100644 --- a/docs/guide/export.rst +++ b/docs/guide/export.rst @@ -62,7 +62,7 @@ For PPO, assuming a shared feature extactor.
action_hidden, value_hidden = self.extractor(observation) return self.action_net(action_hidden), self.value_net(value_hidden) - # Example: model = PPO("MlpPolicy", "Pendulum-v0") + # Example: model = PPO("MlpPolicy", "Pendulum-v1") model = PPO.load("PathToTrainedModel.zip") model.policy.to("cpu") onnxable_model = OnnxablePolicy(model.policy.mlp_extractor, model.policy.action_net, model.policy.value_net) diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst index 0929b9eef..18f1cebc4 100644 --- a/docs/guide/tensorboard.rst +++ b/docs/guide/tensorboard.rst @@ -61,7 +61,7 @@ Here is a simple example on how to log both additional tensor or arbitrary scala from stable_baselines3 import SAC from stable_baselines3.common.callbacks import BaseCallback - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class TensorboardCallback(BaseCallback): @@ -104,7 +104,7 @@ Here is an example of how to render an image to TensorBoard at regular intervals from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.logger import Image - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class ImageRecorderCallback(BaseCallback): @@ -141,7 +141,7 @@ Here is an example of how to store a plot in TensorBoard at regular intervals: from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.logger import Figure - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class FigureRecorderCallback(BaseCallback): @@ -251,7 +251,7 @@ can get direct access to the underlying SummaryWriter in a callback: - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class SummaryWriterCallback(BaseCallback): diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 39830590d..07a0cf339 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -119,6 +119,7 @@ Release 1.3.0 (2021-10-23) Breaking Changes: ^^^^^^^^^^^^^^^^^ +- Support for Python 3.6 was removed. - ``sde_net_arch`` argument in policies is deprecated and will be removed in a future version. - ``_get_latent`` (``ActorCriticPolicy``) was removed - All logging keys now use underscores instead of spaces (@timokau). Concretely this changes: @@ -127,6 +128,7 @@ Breaking Changes: - ``rollout/exploration rate`` to ``rollout/exploration_rate`` and - ``rollout/success rate`` to ``rollout/success_rate``. + New Features: ^^^^^^^^^^^^^ - Added methods ``get_distribution`` and ``predict_values`` for ``ActorCriticPolicy`` for A2C/PPO/TRPO (@cyprienc) @@ -145,6 +147,7 @@ Bug Fixes: Deprecations: ^^^^^^^^^^^^^ +- Switched minimum Gym version to 0.21.0. 
Others: ^^^^^^^ diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst index 24d265f00..c484a1c93 100644 --- a/docs/modules/ddpg.rst +++ b/docs/modules/ddpg.rst @@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import DDPG from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") # The noise objects for DDPG n_actions = env.action_space.shape[-1] diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst index a1156fd8c..e7f9057d5 100644 --- a/docs/modules/sac.rst +++ b/docs/modules/sac.rst @@ -73,7 +73,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import SAC - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") model = SAC("MlpPolicy", env, verbose=1) model.learn(total_timesteps=10000, log_interval=4) diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst index 3bc93d7a9..d039ae71c 100644 --- a/docs/modules/td3.rst +++ b/docs/modules/td3.rst @@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import TD3 from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") # The noise objects for TD3 n_actions = env.action_space.shape[-1] diff --git a/setup.py b/setup.py index 3e0f788af..eabf30c66 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ packages=[package for package in find_packages() if package.startswith("stable_baselines3")], package_data={"stable_baselines3": ["py.typed", "version.txt"]}, install_requires=[ - "gym>=0.17,<0.20", # gym 0.20 breaks atari-py behavior + "gym>=0.21", # Remember to also update gym version in "extra" below when this changes "numpy", "torch>=1.8.1", # For saving models @@ -116,7 +116,7 @@ # For render "opencv-python", # For atari games, - "atari_py==0.2.6", + "gym[atari,accept-rom-license]>=0.21", "pillow", # Tensorboard support "tensorboard>=2.2.0", diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 56fc14109..e1f6d3869 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -75,7 +75,7 @@ def test_callbacks(tmp_path, model_class): if model_class in [A2C, PPO]: max_episodes = 1 n_envs = 2 - # Pendulum-v0 has a timelimit of 200 timesteps + # Pendulum-v1 has a timelimit of 200 timesteps max_episode_length = 200 envs = make_vec_env(env_name, n_envs=n_envs, seed=0) @@ -99,7 +99,7 @@ def select_env(model_class) -> str: if model_class is DQN: return "CartPole-v0" else: - return "Pendulum-v0" + return "Pendulum-v1" def test_eval_callback_vec_env(): diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py index 02fa7b4a0..e2d98fbd5 100644 --- a/tests/test_custom_policy.py +++ b/tests/test_custom_policy.py @@ -25,7 +25,7 @@ def test_flexible_mlp(model_class, net_arch): @pytest.mark.parametrize("net_arch", [[], [4], [4, 4], dict(qf=[8], pi=[8, 4])]) @pytest.mark.parametrize("model_class", [SAC, TD3]) def test_custom_offpolicy(model_class, net_arch): - _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300) + _ = model_class("MlpPolicy", "Pendulum-v1", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300) @pytest.mark.parametrize("model_class", [A2C, PPO, SAC, TD3]) @@ -38,12 +38,12 @@ def test_custom_optimizer(model_class, optimizer_kwargs): 
kwargs = dict(n_steps=64) policy_kwargs = dict(optimizer_class=th.optim.AdamW, optimizer_kwargs=optimizer_kwargs, net_arch=[32]) - _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs, **kwargs).learn(300) + _ = model_class("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs, **kwargs).learn(300) def test_tf_like_rmsprop_optimizer(): policy_kwargs = dict(optimizer_class=RMSpropTFLike, net_arch=[32]) - _ = A2C("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs).learn(500) + _ = A2C("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs).learn(500) def test_dqn_custom_policy(): diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py index 3712fc21a..4c92d269f 100644 --- a/tests/test_deterministic.py +++ b/tests/test_deterministic.py @@ -13,7 +13,7 @@ def test_deterministic_training_common(algo): rewards = [[], []] # Smaller network kwargs = {"policy_kwargs": dict(net_arch=[64])} - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" if algo in [TD3, SAC]: kwargs.update({"action_noise": NormalActionNoise(0.0, 0.1), "learning_starts": 100, "train_freq": 4}) else: diff --git a/tests/test_distributions.py b/tests/test_distributions.py index b894dd478..3652b1850 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -43,7 +43,7 @@ def test_squashed_gaussian(model_class): """ Test run with squashed Gaussian (notably entropy computation) """ - model = model_class("MlpPolicy", "Pendulum-v0", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True)) + model = model_class("MlpPolicy", "Pendulum-v1", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True)) model.learn(500) gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS) @@ -57,10 +57,10 @@ def test_squashed_gaussian(model_class): @pytest.fixture() def dummy_model_distribution_obs_and_actions() -> Tuple[A2C, np.array, np.array]: """ - Fixture creating a Pendulum-v0 gym env, an A2C model and sampling 10 random observations and actions from the env + Fixture creating a Pendulum-v1 gym env, an A2C model and sampling 10 random observations and actions from the env :return: A2C model, random observations, random actions """ - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") model = A2C("MlpPolicy", env, seed=23) random_obs = np.array([env.observation_space.sample() for _ in range(10)]) random_actions = np.array([env.action_space.sample() for _ in range(10)]) diff --git a/tests/test_env_checker.py b/tests/test_env_checker.py index 6364bd4ba..0b0a82d8f 100644 --- a/tests/test_env_checker.py +++ b/tests/test_env_checker.py @@ -11,14 +11,14 @@ class ActionDictTestEnv(gym.Env): observation_space = Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float32) def step(self, action): - observation = np.array([1.0, 1.5, 0.5]) + observation = np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype) reward = 1 done = True info = {} return observation, reward, done, info def reset(self): - return np.array([1.0, 1.5, 0.5]) + return np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype) def render(self, mode="human"): pass diff --git a/tests/test_envs.py b/tests/test_envs.py index d0434773a..b859ed703 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -27,7 +27,7 @@ ] -@pytest.mark.parametrize("env_id", ["CartPole-v0", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v0", "Pendulum-v1"]) def test_env(env_id): """ Check that environmnent integrated in Gym pass the test. 
@@ -38,9 +38,9 @@ def test_env(env_id): with pytest.warns(None) as record: check_env(env) - # Pendulum-v0 will produce a warning because the action space is + # Pendulum-v1 will produce a warning because the action space is # in [-2, 2] and not [-1, 1] - if env_id == "Pendulum-v0": + if env_id == "Pendulum-v1": assert len(record) == 1 else: # The other environments must pass without warning diff --git a/tests/test_predict.py b/tests/test_predict.py index 436547b83..853f4d11d 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -43,7 +43,7 @@ def test_auto_wrap(model_class): if model_class is DQN: env_name = "CartPole-v0" else: - env_name = "Pendulum-v0" + env_name = "Pendulum-v1" env = gym.make(env_name) eval_env = gym.make(env_name) model = model_class("MlpPolicy", env) @@ -51,7 +51,7 @@ def test_auto_wrap(model_class): @pytest.mark.parametrize("model_class", MODEL_LIST) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) @pytest.mark.parametrize("device", ["cpu", "cuda", "auto"]) def test_predict(model_class, env_id, device): if device == "cuda" and not th.cuda.is_available(): diff --git a/tests/test_run.py b/tests/test_run.py index 67b31c482..223776dfb 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -17,7 +17,7 @@ def test_deterministic_pg(model_class, action_noise): """ model = model_class( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, @@ -28,13 +28,13 @@ def test_deterministic_pg(model_class, action_noise): model.learn(total_timesteps=300, eval_freq=250) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"]) def test_a2c(env_id): model = A2C("MlpPolicy", env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"]) @pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2]) def test_ppo(env_id, clip_range_vf): if clip_range_vf is not None and clip_range_vf < 0: @@ -67,7 +67,7 @@ def test_ppo(env_id, clip_range_vf): def test_sac(ent_coef): model = SAC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, @@ -84,7 +84,7 @@ def test_n_critics(n_critics): # Test SAC with different number of critics, for TD3, n_critics=1 corresponds to DDPG model = SAC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=n_critics), learning_starts=100, buffer_size=10000, @@ -112,7 +112,7 @@ def test_train_freq(tmp_path, train_freq): model = SAC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=1), learning_starts=100, buffer_size=10000, @@ -133,7 +133,7 @@ def test_train_freq_fail(train_freq): with pytest.raises(ValueError): model = SAC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=1), learning_starts=100, buffer_size=10000, @@ -147,7 +147,7 @@ def test_train_freq_fail(train_freq): def test_offpolicy_multi_env(model_class): kwargs = {} if model_class in [SAC, TD3, DDPG]: - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" policy_kwargs = dict(net_arch=[64], n_critics=1) # Check auto-conversion to VectorizedActionNoise kwargs = 
dict(action_noise=NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))) diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 69a3f4816..7d810c70e 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -269,7 +269,7 @@ def test_exclude_include_saved_params(tmp_path, model_class): def test_save_load_pytorch_var(tmp_path): - model = SAC("MlpPolicy", "Pendulum-v0", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "sac_pendulum") model.save(save_path) @@ -286,7 +286,7 @@ def test_save_load_pytorch_var(tmp_path): assert not th.allclose(log_ent_coef_before, log_ent_coef_after) # With a fixed entropy coef - model = SAC("MlpPolicy", "Pendulum-v0", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "sac_pendulum") model.save(save_path) diff --git a/tests/test_sde.py b/tests/test_sde.py index 17ac1501d..0a650a57c 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -65,7 +65,7 @@ def test_state_dependent_noise(model_class, use_expln): kwargs = {"learning_starts": 0} if model_class == SAC else {"n_steps": 64} model = model_class( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", use_sde=True, seed=None, create_eval_env=True, diff --git a/tests/test_spaces.py b/tests/test_spaces.py index deb09c4e4..54994b2b5 100644 --- a/tests/test_spaces.py +++ b/tests/test_spaces.py @@ -53,10 +53,10 @@ def test_identity_spaces(model_class, env): @pytest.mark.parametrize("model_class", [A2C, DDPG, DQN, PPO, SAC, TD3]) -@pytest.mark.parametrize("env", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env", ["Pendulum-v1", "CartPole-v1"]) def test_action_spaces(model_class, env): if model_class in [SAC, DDPG, TD3]: - supported_action_space = env == "Pendulum-v0" + supported_action_space = env == "Pendulum-v1" elif model_class == DQN: supported_action_space = env == "CartPole-v1" elif model_class in [A2C, PPO]: diff --git a/tests/test_tensorboard.py b/tests/test_tensorboard.py index 3f755a7aa..20f58b912 100644 --- a/tests/test_tensorboard.py +++ b/tests/test_tensorboard.py @@ -7,8 +7,8 @@ MODEL_DICT = { "a2c": (A2C, "CartPole-v1"), "ppo": (PPO, "CartPole-v1"), - "sac": (SAC, "Pendulum-v0"), - "td3": (TD3, "Pendulum-v0"), + "sac": (SAC, "Pendulum-v1"), + "td3": (TD3, "Pendulum-v1"), } N_STEPS = 100 diff --git a/tests/test_train_eval_mode.py b/tests/test_train_eval_mode.py index c5eb283b7..1ea2efe67 100644 --- a/tests/test_train_eval_mode.py +++ b/tests/test_train_eval_mode.py @@ -172,7 +172,7 @@ def test_dqn_train_with_batch_norm(): def test_td3_train_with_batch_norm(): model = TD3( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor), learning_starts=0, tau=0, # do not copy the target @@ -219,7 +219,7 @@ def test_td3_train_with_batch_norm(): def test_sac_train_with_batch_norm(): model = SAC( "MlpPolicy", - "Pendulum-v0", + "Pendulum-v1", policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor), learning_starts=0, tau=0, # do not copy the target @@ -257,7 +257,7 @@ def test_sac_train_with_batch_norm(): @pytest.mark.parametrize("model_class", [A2C, PPO]) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) 
+@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) def test_a2c_ppo_train_with_batch_norm(model_class, env_id): model = model_class( "MlpPolicy", @@ -281,7 +281,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class): if model_class in [DQN]: env_id = "CartPole-v1" else: - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" clone_helper = CLONE_HELPERS[model_class] @@ -308,7 +308,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class): @pytest.mark.parametrize("model_class", [A2C, PPO]) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id): model = model_class( "MlpPolicy", @@ -332,7 +332,7 @@ def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id): @pytest.mark.parametrize("model_class", MODEL_LIST) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) def test_predict_with_dropout_batch_norm(model_class, env_id): if env_id == "CartPole-v1": if model_class in [SAC, TD3]: diff --git a/tests/test_utils.py b/tests/test_utils.py index ea497140e..b07bbe931 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -43,7 +43,6 @@ def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class): @pytest.mark.parametrize("n_envs", [1, 2]) @pytest.mark.parametrize("wrapper_kwargs", [None, dict(clip_reward=False, screen_size=60)]) def test_make_atari_env(env_id, n_envs, wrapper_kwargs): - env_id = "BreakoutNoFrameskip-v4" env = make_atari_env(env_id, n_envs, wrapper_kwargs=wrapper_kwargs, monitor_dir=None, seed=0) assert env.num_envs == n_envs @@ -97,7 +96,7 @@ def test_vec_env_monitor_kwargs(): def test_env_auto_monitor_wrap(): - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True @@ -105,7 +104,7 @@ def test_env_auto_monitor_wrap(): model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True - model = A2C("MlpPolicy", "Pendulum-v0") + model = A2C("MlpPolicy", "Pendulum-v1") assert model.env.env_is_wrapped(Monitor)[0] is True @@ -137,7 +136,7 @@ def test_custom_vec_env(tmp_path): def test_evaluate_policy(): - model = A2C("MlpPolicy", "Pendulum-v0", seed=0) + model = A2C("MlpPolicy", "Pendulum-v1", seed=0) n_steps_per_episode, n_eval_episodes = 200, 2 model.n_callback_calls = 0 @@ -167,7 +166,7 @@ def dummy_callback(locals_, _globals): assert len(episode_rewards) == n_eval_episodes # Test that warning is given about no monitor - eval_env = gym.make("Pendulum-v0") + eval_env = gym.make("Pendulum-v1") with pytest.warns(UserWarning): _ = evaluate_policy(model, eval_env, n_eval_episodes) @@ -356,7 +355,7 @@ def test_zip_strict(): def test_is_wrapped(): """Test that is_wrapped correctly detects wraps""" - env = gym.make("Pendulum-v0") + env = gym.make("Pendulum-v1") env = gym.Wrapper(env) assert not is_wrapped(env, Monitor) monitor_env = Monitor(env) @@ -373,11 +372,11 @@ def test_ppo_warnings(): # Only 1 step: advantage normalization will return NaN with pytest.raises(AssertionError): - PPO("MlpPolicy", "Pendulum-v0", n_steps=1) + PPO("MlpPolicy", "Pendulum-v1", n_steps=1) # Truncated mini-batch with pytest.warns(UserWarning): - PPO("MlpPolicy", "Pendulum-v0", n_steps=6, batch_size=8) + PPO("MlpPolicy", "Pendulum-v1", n_steps=6, batch_size=8) def test_get_system_info(): diff --git a/tests/test_vec_normalize.py 
b/tests/test_vec_normalize.py index 0136c2657..c3d1d3065 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -16,7 +16,7 @@ unwrap_vec_normalize, ) -ENV_ID = "Pendulum-v0" +ENV_ID = "Pendulum-v1" class DummyRewardEnv(gym.Env):
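
A minimal sketch of how the pieces above fit together after the migration, assuming ``stable-baselines3[extra]`` (or ``gym>=0.21`` together with ``gym[atari,accept-rom-license]``) is installed; ``BreakoutNoFrameskip-v4`` is used only as a representative Atari ID:

    import gym

    from stable_baselines3 import SAC
    from stable_baselines3.common.env_util import make_atari_env

    # The patch replaces every Pendulum-v0 reference with Pendulum-v1,
    # the ID available in Gym >= 0.21.
    env = gym.make("Pendulum-v1")

    # A few training steps are enough to confirm the algorithm and the
    # environment still work together (mirrors the smoke tests above).
    model = SAC("MlpPolicy", env, buffer_size=10000, learning_starts=50, verbose=0)
    model.learn(total_timesteps=200)

    # The gym[atari,accept-rom-license] extra replaces the old atari-py pin,
    # so make_atari_env can find the ROMs without a manual download.
    vec_env = make_atari_env("BreakoutNoFrameskip-v4", n_envs=1, seed=0)
    print(env.spec.id, vec_env.num_envs)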