From 243457e1fe809b1e98d690b33c018cfa9ba7e956 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 16 Sep 2021 09:04:26 -0400 Subject: [PATCH 01/25] fix Atari in CI --- .github/workflows/ci.yml | 5 +++-- setup.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6626122dc..51069c50e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,8 +32,9 @@ jobs: pip install .[extra,tests,docs] # Use headless version pip install opencv-python-headless - # Tmp fix: ROM missing in the newest atari-py version - pip install atari-py==0.2.5 + # Add Atari ROMs + pip install AutoROM + AutoROM -v - name: Build the doc run: | make doc diff --git a/setup.py b/setup.py index cb5785959..7929791e8 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ packages=[package for package in find_packages() if package.startswith("stable_baselines3")], package_data={"stable_baselines3": ["py.typed", "version.txt"]}, install_requires=[ - "gym>=0.17,<0.20", # gym 0.20 breaks atari-py behavior + "gym>=0.20", "numpy", "torch>=1.8.1", # For saving models @@ -116,7 +116,7 @@ # For render "opencv-python", # For atari games, - "atari_py~=0.2.0", + "gym[atari]>=0.20.0", "pillow", # Tensorboard support "tensorboard>=2.2.0", From 0d94863ec25c99f35e005e6b97ae9d77af7bf2b7 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 16 Sep 2021 09:58:30 -0400 Subject: [PATCH 02/25] fix dtype and atari extra --- setup.py | 2 +- tests/test_env_checker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7929791e8..ac95f807a 100644 --- a/setup.py +++ b/setup.py @@ -116,7 +116,7 @@ # For render "opencv-python", # For atari games, - "gym[atari]>=0.20.0", + "ale-py~=0.7", "pillow", # Tensorboard support "tensorboard>=2.2.0", diff --git a/tests/test_env_checker.py b/tests/test_env_checker.py index 6364bd4ba..776e774f7 100644 --- a/tests/test_env_checker.py +++ b/tests/test_env_checker.py @@ -8,7 +8,7 @@ class ActionDictTestEnv(gym.Env): action_space = Dict({"position": Discrete(1), "velocity": Discrete(1)}) - observation_space = Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float32) + observation_space = Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float64) def step(self, action): observation = np.array([1.0, 1.5, 0.5]) From 4899c60db30403e0138baad1b7419255b157cf06 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 00:26:52 -0400 Subject: [PATCH 03/25] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac95f807a..393de02cf 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ packages=[package for package in find_packages() if package.startswith("stable_baselines3")], package_data={"stable_baselines3": ["py.typed", "version.txt"]}, install_requires=[ - "gym>=0.20", + "gym>=0.21", "numpy", "torch>=1.8.1", # For saving models From 4329f4b31a0b71f7079d28251ef73bd11ca1ad88 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 00:31:25 -0400 Subject: [PATCH 04/25] remove 3.6 --- .github/workflows/ci.yml | 2 +- Dockerfile | 2 +- README.md | 2 +- docs/conda_env.yml | 2 +- docs/guide/install.rst | 2 +- docs/misc/changelog.rst | 3 +++ 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51069c50e..7d565a325 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: 
[3.6, 3.7, 3.8, 3.9] + python-version: [3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 diff --git a/Dockerfile b/Dockerfile index a946a3601..8dfbbbf4c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ ARG PARENT_IMAGE FROM $PARENT_IMAGE ARG PYTORCH_DEPS=cpuonly -ARG PYTHON_VERSION=3.6 +ARG PYTHON_VERSION=3.7 RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ diff --git a/README.md b/README.md index df1689856..ff28c0f44 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ Documentation is available online: [https://sb3-contrib.readthedocs.io/](https:/ **Note:** Stable-Baselines3 supports PyTorch >= 1.8.1. ### Prerequisites -Stable Baselines3 requires python 3.6+. +Stable Baselines3 requires Python 3.7+. #### Windows 10 diff --git a/docs/conda_env.yml b/docs/conda_env.yml index f2f153df1..fa18053bd 100644 --- a/docs/conda_env.yml +++ b/docs/conda_env.yml @@ -5,7 +5,7 @@ channels: dependencies: - cpuonly=1.0=0 - pip=20.2 - - python=3.6 + - python=3.7 - pytorch=1.8.1=py3.6_cpu_0 - pip: - gym>=0.17.2 diff --git a/docs/guide/install.rst b/docs/guide/install.rst index f43489b9b..7beabb782 100644 --- a/docs/guide/install.rst +++ b/docs/guide/install.rst @@ -6,7 +6,7 @@ Installation Prerequisites ------------- -Stable-Baselines3 requires python 3.6+ and PyTorch >= 1.8.1. +Stable-Baselines3 requires python 3.7+ and PyTorch >= 1.8.1. Windows 10 ~~~~~~~~~~ diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index d7f4775b8..dd5146bf2 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -13,6 +13,7 @@ Breaking Changes: - ``sde_net_arch`` argument in policies is deprecated and will be removed in a future version. - ``_get_latent`` (``ActorCriticPolicy``) was removed + New Features: ^^^^^^^^^^^^^ - Added methods ``get_distribution`` and ``predict_values`` for ``ActorCriticPolicy`` for A2C/PPO/TRPO (@cyprienc) @@ -28,6 +29,8 @@ Bug Fixes: Deprecations: ^^^^^^^^^^^^^ +- Support for Python 3.6 was removed +- Switched minimum Gym version to 0.21.0 Others: ^^^^^^^ From d2ad8fd18ffc9dbe3d356097d3a22129cca8901a Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 00:37:43 -0400 Subject: [PATCH 05/25] note about how to install Atari --- docs/conda_env.yml | 2 +- docs/guide/examples.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conda_env.yml b/docs/conda_env.yml index fa18053bd..7e4da2a3f 100644 --- a/docs/conda_env.yml +++ b/docs/conda_env.yml @@ -6,7 +6,7 @@ dependencies: - cpuonly=1.0=0 - pip=20.2 - python=3.7 - - pytorch=1.8.1=py3.6_cpu_0 + - pytorch=1.8.1=py3.7_cpu_0 - pip: - gym>=0.17.2 - cloudpickle diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 7e870f6e1..6c84224d0 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -290,7 +290,7 @@ Atari Games Training a RL agent on Atari games is straightforward thanks to ``make_atari_env`` helper function. It will do `all the preprocessing `_ -and multiprocessing for you. +and multiprocessing for you. To install the Atari environments, run the command``pip install gym[atari, accept-rom-license]`` to install the Atari environments and ROMs. .. 
image:: ../_static/img/colab-badge.svg :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/atari_games.ipynb From cd293012da922ae199e7bb84034546f3a9751e17 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 00:51:05 -0400 Subject: [PATCH 06/25] pendulum-v1 --- docs/guide/callbacks.rst | 18 +++++++++--------- docs/guide/custom_policy.rst | 2 +- docs/guide/examples.rst | 2 +- docs/guide/export.rst | 2 +- docs/guide/tensorboard.rst | 8 ++++---- docs/modules/ddpg.rst | 2 +- docs/modules/ppo.rst | 2 +- docs/modules/sac.rst | 2 +- docs/modules/td3.rst | 2 +- tests/test_callbacks.py | 4 ++-- tests/test_custom_policy.py | 6 +++--- tests/test_deterministic.py | 2 +- tests/test_distributions.py | 6 +++--- tests/test_envs.py | 6 +++--- tests/test_predict.py | 4 ++-- tests/test_run.py | 14 +++++++------- tests/test_save_load.py | 4 ++-- tests/test_sde.py | 2 +- tests/test_spaces.py | 4 ++-- tests/test_tensorboard.py | 4 ++-- tests/test_train_eval_mode.py | 12 ++++++------ tests/test_utils.py | 14 +++++++------- tests/test_vec_normalize.py | 2 +- 23 files changed, 62 insertions(+), 62 deletions(-) diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst index 279664171..23cf54aa3 100644 --- a/docs/guide/callbacks.rst +++ b/docs/guide/callbacks.rst @@ -174,7 +174,7 @@ and optionally a prefix for the checkpoints (``rl_model`` by default). checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', name_prefix='rl_model') - model = SAC('MlpPolicy', 'Pendulum-v0') + model = SAC('MlpPolicy', 'pendulum-v1') model.learn(2000, callback=checkpoint_callback) @@ -206,13 +206,13 @@ It will save the best model if ``best_model_save_path`` folder is specified and from stable_baselines3.common.callbacks import EvalCallback # Separate evaluation env - eval_env = gym.make('Pendulum-v0') + eval_env = gym.make('pendulum-v1') # Use deterministic actions for evaluation eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=500, deterministic=True, render=False) - model = SAC('MlpPolicy', 'Pendulum-v0') + model = SAC('MlpPolicy', 'pendulum-v1') model.learn(5000, callback=eval_callback) @@ -234,13 +234,13 @@ Alternatively, you can pass directly a list of callbacks to the ``learn()`` meth checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/') # Separate evaluation env - eval_env = gym.make('Pendulum-v0') + eval_env = gym.make('pendulum-v1') eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model', log_path='./logs/results', eval_freq=500) # Create the callback list callback = CallbackList([checkpoint_callback, eval_callback]) - model = SAC('MlpPolicy', 'Pendulum-v0') + model = SAC('MlpPolicy', 'pendulum-v1') # Equivalent to: # model.learn(5000, callback=[checkpoint_callback, eval_callback]) model.learn(5000, callback=callback) @@ -263,12 +263,12 @@ It must be used with the :ref:`EvalCallback` and use the event triggered by a ne from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold # Separate evaluation env - eval_env = gym.make('Pendulum-v0') + eval_env = gym.make('pendulum-v1') # Stop training when the model reaches the reward threshold callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1) eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1) - model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1) + model = SAC('MlpPolicy', 'pendulum-v1', verbose=1) # 
Almost infinite number of timesteps, but the training will stop # early as soon as the reward threshold is reached model.learn(int(1e10), callback=eval_callback) @@ -299,7 +299,7 @@ An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` t checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/') event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event) - model = PPO('MlpPolicy', 'Pendulum-v0', verbose=1) + model = PPO('MlpPolicy', 'pendulum-v1', verbose=1) model.learn(int(2e4), callback=event_callback) @@ -328,7 +328,7 @@ and in total for ``max_episodes * n_envs`` episodes. # Stops training when the model reaches the maximum number of episodes callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1) - model = A2C('MlpPolicy', 'Pendulum-v0', verbose=1) + model = A2C('MlpPolicy', 'pendulum-v1', verbose=1) # Almost infinite number of timesteps, but the training will stop # early as soon as the max number of episodes is reached model.learn(int(1e10), callback=callback_max_episodes) diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst index 08c0b601a..159c6eea5 100644 --- a/docs/guide/custom_policy.rst +++ b/docs/guide/custom_policy.rst @@ -401,5 +401,5 @@ you only need to specify ``net_arch=[256, 256]`` (here, two hidden layers of 256 # Custom critic architecture with two layers of 400 and 300 units policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[400, 300])) # Create the agent - model = SAC("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs, verbose=1) + model = SAC("MlpPolicy", "pendulum-v1", policy_kwargs=policy_kwargs, verbose=1) model.learn(5000) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 6c84224d0..97c232340 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -533,7 +533,7 @@ Behind the scene, SB3 uses an :ref:`EvalCallback `. # Create the model, the training environment # and the test environment (for evaluation) - model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1, + model = SAC('MlpPolicy', 'pendulum-v1', verbose=1, learning_rate=1e-3, create_eval_env=True) # Evaluate the model every 1000 steps on 5 test episodes diff --git a/docs/guide/export.rst b/docs/guide/export.rst index 8d1ff0e7a..a32c85c79 100644 --- a/docs/guide/export.rst +++ b/docs/guide/export.rst @@ -62,7 +62,7 @@ For PPO, assuming a shared feature extactor. 
action_hidden, value_hidden = self.extractor(observation) return self.action_net(action_hidden), self.value_net(value_hidden) - # Example: model = PPO("MlpPolicy", "Pendulum-v0") + # Example: model = PPO("MlpPolicy", "pendulum-v1") model = PPO.load("PathToTrainedModel.zip") model.policy.to("cpu") onnxable_model = OnnxablePolicy(model.policy.mlp_extractor, model.policy.action_net, model.policy.value_net) diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst index 833ab1b5a..7348fd45d 100644 --- a/docs/guide/tensorboard.rst +++ b/docs/guide/tensorboard.rst @@ -61,7 +61,7 @@ Here is a simple example on how to log both additional tensor or arbitrary scala from stable_baselines3 import SAC from stable_baselines3.common.callbacks import BaseCallback - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class TensorboardCallback(BaseCallback): @@ -104,7 +104,7 @@ Here is an example of how to render an image to TensorBoard at regular intervals from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.logger import Image - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class ImageRecorderCallback(BaseCallback): @@ -141,7 +141,7 @@ Here is an example of how to store a plot in TensorBoard at regular intervals: from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.logger import Figure - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class FigureRecorderCallback(BaseCallback): @@ -251,7 +251,7 @@ can get direct access to the underlying SummaryWriter in a callback: - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class SummaryWriterCallback(BaseCallback): diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst index dd07c22fd..059da5f05 100644 --- a/docs/modules/ddpg.rst +++ b/docs/modules/ddpg.rst @@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import DDPG from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - env = gym.make("Pendulum-v0") + env = gym.make("pendulum-v1") # The noise objects for DDPG n_actions = env.action_space.shape[-1] diff --git a/docs/modules/ppo.rst b/docs/modules/ppo.rst index 3b0d7204a..8fee0356b 100644 --- a/docs/modules/ppo.rst +++ b/docs/modules/ppo.rst @@ -50,7 +50,7 @@ Example This example is only to demonstrate the use of the library and its functions, and the trained agents may not solve the environments. Optimized hyperparameters can be found in RL Zoo `repository `_. -Train a PPO agent on ``Pendulum-v0`` using 4 environments. +Train a PPO agent on ``pendulum-v1`` using 4 environments. .. 
code-block:: python diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst index 2f6804702..48a90783c 100644 --- a/docs/modules/sac.rst +++ b/docs/modules/sac.rst @@ -73,7 +73,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import SAC - env = gym.make("Pendulum-v0") + env = gym.make("pendulum-v1") model = SAC("MlpPolicy", env, verbose=1) model.learn(total_timesteps=10000, log_interval=4) diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst index 33cb38f5f..03aafe726 100644 --- a/docs/modules/td3.rst +++ b/docs/modules/td3.rst @@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import TD3 from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - env = gym.make("Pendulum-v0") + env = gym.make("pendulum-v1") # The noise objects for TD3 n_actions = env.action_space.shape[-1] diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index c94bb8f75..6a8af11bc 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -75,7 +75,7 @@ def test_callbacks(tmp_path, model_class): if model_class in [A2C, PPO]: max_episodes = 1 n_envs = 2 - # Pendulum-v0 has a timelimit of 200 timesteps + # pendulum-v1 has a timelimit of 200 timesteps max_episode_length = 200 envs = make_vec_env(env_name, n_envs=n_envs, seed=0) @@ -99,7 +99,7 @@ def select_env(model_class) -> str: if model_class is DQN: return "CartPole-v0" else: - return "Pendulum-v0" + return "pendulum-v1" def test_eval_callback_vec_env(): diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py index 02fa7b4a0..5221102c5 100644 --- a/tests/test_custom_policy.py +++ b/tests/test_custom_policy.py @@ -25,7 +25,7 @@ def test_flexible_mlp(model_class, net_arch): @pytest.mark.parametrize("net_arch", [[], [4], [4, 4], dict(qf=[8], pi=[8, 4])]) @pytest.mark.parametrize("model_class", [SAC, TD3]) def test_custom_offpolicy(model_class, net_arch): - _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300) + _ = model_class("MlpPolicy", "pendulum-v1", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300) @pytest.mark.parametrize("model_class", [A2C, PPO, SAC, TD3]) @@ -38,12 +38,12 @@ def test_custom_optimizer(model_class, optimizer_kwargs): kwargs = dict(n_steps=64) policy_kwargs = dict(optimizer_class=th.optim.AdamW, optimizer_kwargs=optimizer_kwargs, net_arch=[32]) - _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs, **kwargs).learn(300) + _ = model_class("MlpPolicy", "pendulum-v1", policy_kwargs=policy_kwargs, **kwargs).learn(300) def test_tf_like_rmsprop_optimizer(): policy_kwargs = dict(optimizer_class=RMSpropTFLike, net_arch=[32]) - _ = A2C("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs).learn(500) + _ = A2C("MlpPolicy", "pendulum-v1", policy_kwargs=policy_kwargs).learn(500) def test_dqn_custom_policy(): diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py index 16cdcfacf..e3a4b240c 100644 --- a/tests/test_deterministic.py +++ b/tests/test_deterministic.py @@ -14,7 +14,7 @@ def test_deterministic_training_common(algo): # Smaller network kwargs = {"policy_kwargs": dict(net_arch=[64])} if algo in [TD3, SAC]: - env_id = "Pendulum-v0" + env_id = "pendulum-v1" kwargs.update({"action_noise": NormalActionNoise(0.0, 0.1), "learning_starts": 100}) else: env_id = "CartPole-v1" diff --git a/tests/test_distributions.py 
b/tests/test_distributions.py index b894dd478..8397107d5 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -43,7 +43,7 @@ def test_squashed_gaussian(model_class): """ Test run with squashed Gaussian (notably entropy computation) """ - model = model_class("MlpPolicy", "Pendulum-v0", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True)) + model = model_class("MlpPolicy", "pendulum-v1", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True)) model.learn(500) gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS) @@ -57,10 +57,10 @@ def test_squashed_gaussian(model_class): @pytest.fixture() def dummy_model_distribution_obs_and_actions() -> Tuple[A2C, np.array, np.array]: """ - Fixture creating a Pendulum-v0 gym env, an A2C model and sampling 10 random observations and actions from the env + Fixture creating a pendulum-v1 gym env, an A2C model and sampling 10 random observations and actions from the env :return: A2C model, random observations, random actions """ - env = gym.make("Pendulum-v0") + env = gym.make("pendulum-v1") model = A2C("MlpPolicy", env, seed=23) random_obs = np.array([env.observation_space.sample() for _ in range(10)]) random_actions = np.array([env.action_space.sample() for _ in range(10)]) diff --git a/tests/test_envs.py b/tests/test_envs.py index 645c17e3f..b8e075689 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -27,7 +27,7 @@ ] -@pytest.mark.parametrize("env_id", ["CartPole-v0", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v0", "pendulum-v1"]) def test_env(env_id): """ Check that environmnent integrated in Gym pass the test. @@ -38,9 +38,9 @@ def test_env(env_id): with pytest.warns(None) as record: check_env(env) - # Pendulum-v0 will produce a warning because the action space is + # pendulum-v1 will produce a warning because the action space is # in [-2, 2] and not [-1, 1] - if env_id == "Pendulum-v0": + if env_id == "pendulum-v1": assert len(record) == 1 else: # The other environments must pass without warning diff --git a/tests/test_predict.py b/tests/test_predict.py index 436547b83..3b28b1caf 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -43,7 +43,7 @@ def test_auto_wrap(model_class): if model_class is DQN: env_name = "CartPole-v0" else: - env_name = "Pendulum-v0" + env_name = "pendulum-v1" env = gym.make(env_name) eval_env = gym.make(env_name) model = model_class("MlpPolicy", env) @@ -51,7 +51,7 @@ def test_auto_wrap(model_class): @pytest.mark.parametrize("model_class", MODEL_LIST) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) @pytest.mark.parametrize("device", ["cpu", "cuda", "auto"]) def test_predict(model_class, env_id, device): if device == "cuda" and not th.cuda.is_available(): diff --git a/tests/test_run.py b/tests/test_run.py index c588a0257..ae14adb4f 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -15,7 +15,7 @@ def test_deterministic_pg(model_class, action_noise): """ model = model_class( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, @@ -26,13 +26,13 @@ def test_deterministic_pg(model_class, action_noise): model.learn(total_timesteps=300, eval_freq=250) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "pendulum-v1"]) def test_a2c(env_id): model = A2C("MlpPolicy", env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, 
create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "pendulum-v1"]) @pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2]) def test_ppo(env_id, clip_range_vf): if clip_range_vf is not None and clip_range_vf < 0: @@ -65,7 +65,7 @@ def test_ppo(env_id, clip_range_vf): def test_sac(ent_coef): model = SAC( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, @@ -82,7 +82,7 @@ def test_n_critics(n_critics): # Test SAC with different number of critics, for TD3, n_critics=1 corresponds to DDPG model = SAC( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=n_critics), learning_starts=100, buffer_size=10000, @@ -110,7 +110,7 @@ def test_train_freq(tmp_path, train_freq): model = SAC( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=1), learning_starts=100, buffer_size=10000, @@ -131,7 +131,7 @@ def test_train_freq_fail(train_freq): with pytest.raises(ValueError): model = SAC( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=1), learning_starts=100, buffer_size=10000, diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 7b1fef5be..e93334b09 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -234,7 +234,7 @@ def test_exclude_include_saved_params(tmp_path, model_class): def test_save_load_pytorch_var(tmp_path): - model = SAC("MlpPolicy", "Pendulum-v0", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = SAC("MlpPolicy", "pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "sac_pendulum") model.save(save_path) @@ -251,7 +251,7 @@ def test_save_load_pytorch_var(tmp_path): assert not th.allclose(log_ent_coef_before, log_ent_coef_after) # With a fixed entropy coef - model = SAC("MlpPolicy", "Pendulum-v0", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = SAC("MlpPolicy", "pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "sac_pendulum") model.save(save_path) diff --git a/tests/test_sde.py b/tests/test_sde.py index e20b01d74..6fda390ea 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -65,7 +65,7 @@ def test_state_dependent_noise(model_class, use_expln): kwargs = {"learning_starts": 0} if model_class == SAC else {"n_steps": 64} model = model_class( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", use_sde=True, seed=None, create_eval_env=True, diff --git a/tests/test_spaces.py b/tests/test_spaces.py index 1c66d045c..0e44f4256 100644 --- a/tests/test_spaces.py +++ b/tests/test_spaces.py @@ -53,10 +53,10 @@ def test_identity_spaces(model_class, env): @pytest.mark.parametrize("model_class", [A2C, DDPG, DQN, PPO, SAC, TD3]) -@pytest.mark.parametrize("env", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env", ["pendulum-v1", "CartPole-v1"]) def test_action_spaces(model_class, env): if model_class in [SAC, DDPG, TD3]: - supported_action_space = env == "Pendulum-v0" + supported_action_space = env == "pendulum-v1" elif model_class == DQN: supported_action_space = env == "CartPole-v1" elif model_class in [A2C, PPO]: diff --git a/tests/test_tensorboard.py b/tests/test_tensorboard.py index 3f755a7aa..2f9b68087 100644 --- 
a/tests/test_tensorboard.py +++ b/tests/test_tensorboard.py @@ -7,8 +7,8 @@ MODEL_DICT = { "a2c": (A2C, "CartPole-v1"), "ppo": (PPO, "CartPole-v1"), - "sac": (SAC, "Pendulum-v0"), - "td3": (TD3, "Pendulum-v0"), + "sac": (SAC, "pendulum-v1"), + "td3": (TD3, "pendulum-v1"), } N_STEPS = 100 diff --git a/tests/test_train_eval_mode.py b/tests/test_train_eval_mode.py index c5eb283b7..d26fe5661 100644 --- a/tests/test_train_eval_mode.py +++ b/tests/test_train_eval_mode.py @@ -172,7 +172,7 @@ def test_dqn_train_with_batch_norm(): def test_td3_train_with_batch_norm(): model = TD3( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor), learning_starts=0, tau=0, # do not copy the target @@ -219,7 +219,7 @@ def test_td3_train_with_batch_norm(): def test_sac_train_with_batch_norm(): model = SAC( "MlpPolicy", - "Pendulum-v0", + "pendulum-v1", policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor), learning_starts=0, tau=0, # do not copy the target @@ -257,7 +257,7 @@ def test_sac_train_with_batch_norm(): @pytest.mark.parametrize("model_class", [A2C, PPO]) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) def test_a2c_ppo_train_with_batch_norm(model_class, env_id): model = model_class( "MlpPolicy", @@ -281,7 +281,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class): if model_class in [DQN]: env_id = "CartPole-v1" else: - env_id = "Pendulum-v0" + env_id = "pendulum-v1" clone_helper = CLONE_HELPERS[model_class] @@ -308,7 +308,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class): @pytest.mark.parametrize("model_class", [A2C, PPO]) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id): model = model_class( "MlpPolicy", @@ -332,7 +332,7 @@ def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id): @pytest.mark.parametrize("model_class", MODEL_LIST) -@pytest.mark.parametrize("env_id", ["Pendulum-v0", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) def test_predict_with_dropout_batch_norm(model_class, env_id): if env_id == "CartPole-v1": if model_class in [SAC, TD3]: diff --git a/tests/test_utils.py b/tests/test_utils.py index f4092d7ac..d70bcdb1a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -95,7 +95,7 @@ def test_vec_env_monitor_kwargs(): def test_env_auto_monitor_wrap(): - env = gym.make("Pendulum-v0") + env = gym.make("pendulum-v1") model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True @@ -103,7 +103,7 @@ def test_env_auto_monitor_wrap(): model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True - model = A2C("MlpPolicy", "Pendulum-v0") + model = A2C("MlpPolicy", "pendulum-v1") assert model.env.env_is_wrapped(Monitor)[0] is True @@ -135,7 +135,7 @@ def test_custom_vec_env(tmp_path): def test_evaluate_policy(): - model = A2C("MlpPolicy", "Pendulum-v0", seed=0) + model = A2C("MlpPolicy", "pendulum-v1", seed=0) n_steps_per_episode, n_eval_episodes = 200, 2 model.n_callback_calls = 0 @@ -165,7 +165,7 @@ def dummy_callback(locals_, _globals): assert len(episode_rewards) == n_eval_episodes # Test that warning is given about no monitor - eval_env = gym.make("Pendulum-v0") + eval_env = gym.make("pendulum-v1") with 
pytest.warns(UserWarning): _ = evaluate_policy(model, eval_env, n_eval_episodes) @@ -354,7 +354,7 @@ def test_zip_strict(): def test_is_wrapped(): """Test that is_wrapped correctly detects wraps""" - env = gym.make("Pendulum-v0") + env = gym.make("pendulum-v1") env = gym.Wrapper(env) assert not is_wrapped(env, Monitor) monitor_env = Monitor(env) @@ -371,8 +371,8 @@ def test_ppo_warnings(): # Only 1 step: advantage normalization will return NaN with pytest.raises(AssertionError): - PPO("MlpPolicy", "Pendulum-v0", n_steps=1) + PPO("MlpPolicy", "pendulum-v1", n_steps=1) # Truncated mini-batch with pytest.warns(UserWarning): - PPO("MlpPolicy", "Pendulum-v0", n_steps=6, batch_size=8) + PPO("MlpPolicy", "pendulum-v1", n_steps=6, batch_size=8) diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 659174b8e..ca6b2ca47 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -14,7 +14,7 @@ unwrap_vec_normalize, ) -ENV_ID = "Pendulum-v0" +ENV_ID = "pendulum-v1" class DummyRewardEnv(gym.Env): From e01e535fd890e198b2d208884ac370a18f8b59c0 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 00:55:53 -0400 Subject: [PATCH 07/25] atari v5 --- docs/guide/custom_policy.rst | 2 +- docs/guide/examples.rst | 2 +- docs/modules/dqn.rst | 2 +- tests/test_utils.py | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst index 159c6eea5..94e11166b 100644 --- a/docs/guide/custom_policy.rst +++ b/docs/guide/custom_policy.rst @@ -145,7 +145,7 @@ that derives from ``BaseFeaturesExtractor`` and then pass it to the model when t features_extractor_class=CustomCNN, features_extractor_kwargs=dict(features_dim=128), ) - model = PPO("CnnPolicy", "BreakoutNoFrameskip-v4", policy_kwargs=policy_kwargs, verbose=1) + model = PPO("CnnPolicy", "BreakoutNoFrameskip-v5", policy_kwargs=policy_kwargs, verbose=1) model.learn(1000) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 97c232340..b68579603 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -305,7 +305,7 @@ and multiprocessing for you. To install the Atari environments, run the command` # There already exists an environment generator # that will make and wrap atari environments correctly. # Here we are also multi-worker training (n_envs=4 => 4 environments) - env = make_atari_env('PongNoFrameskip-v4', n_envs=4, seed=0) + env = make_atari_env('PongNoFrameskip-v5', n_envs=4, seed=0) # Frame-stacking with 4 frames env = VecFrameStack(env, n_stack=4) diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst index 0c497ada3..9a1ea96d6 100644 --- a/docs/modules/dqn.rst +++ b/docs/modules/dqn.rst @@ -99,7 +99,7 @@ Clone the `rl-zoo repo `_: cd rl-baselines3-zoo/ -Run the benchmark (replace ``$ENV_ID`` by the env id, for instance ``BreakoutNoFrameskip-v4``): +Run the benchmark (replace ``$ENV_ID`` by the env id, for instance ``BreakoutNoFrameskip-v5``): .. 
code-block:: bash diff --git a/tests/test_utils.py b/tests/test_utils.py index d70bcdb1a..f26962975 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -37,11 +37,11 @@ def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class): env.close() -@pytest.mark.parametrize("env_id", ["BreakoutNoFrameskip-v4"]) +@pytest.mark.parametrize("env_id", ["BreakoutNoFrameskip-v5"]) @pytest.mark.parametrize("n_envs", [1, 2]) @pytest.mark.parametrize("wrapper_kwargs", [None, dict(clip_reward=False, screen_size=60)]) def test_make_atari_env(env_id, n_envs, wrapper_kwargs): - env_id = "BreakoutNoFrameskip-v4" + env_id = "BreakoutNoFrameskip-v5" env = make_atari_env(env_id, n_envs, wrapper_kwargs=wrapper_kwargs, monitor_dir=None, seed=0) assert env.num_envs == n_envs @@ -79,14 +79,14 @@ def test_vec_env_monitor_kwargs(): env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False}) assert env.get_attr("allow_early_resets")[0] is False - env = make_atari_env("BreakoutNoFrameskip-v4", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False}) + env = make_atari_env("BreakoutNoFrameskip-v5", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False}) assert env.get_attr("allow_early_resets")[0] is False env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": True}) assert env.get_attr("allow_early_resets")[0] is True env = make_atari_env( - "BreakoutNoFrameskip-v4", + "BreakoutNoFrameskip-v5", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": True}, From c4e4f0ab430ce5be0e2f6604a6ad9c4cc3911fb2 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 01:19:15 -0400 Subject: [PATCH 08/25] black --- tests/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 226d2d468..d5fcf599a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -378,6 +378,7 @@ def test_ppo_warnings(): with pytest.warns(UserWarning): PPO("MlpPolicy", "pendulum-v1", n_steps=6, batch_size=8) + def test_get_system_info(): info, info_str = get_system_info(print_info=True) assert info["Stable-Baselines3"] == str(sb3.__version__) From 4279d632a31f375803218d3174b6c3944f66ae54 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 01:43:43 -0400 Subject: [PATCH 09/25] fix pendulum capitalization --- docs/guide/callbacks.rst | 18 +++++++++--------- docs/guide/custom_policy.rst | 2 +- docs/guide/examples.rst | 2 +- docs/guide/export.rst | 2 +- docs/guide/tensorboard.rst | 8 ++++---- docs/modules/ddpg.rst | 2 +- docs/modules/sac.rst | 2 +- docs/modules/td3.rst | 2 +- tests/test_callbacks.py | 4 ++-- tests/test_custom_policy.py | 6 +++--- tests/test_deterministic.py | 2 +- tests/test_distributions.py | 6 +++--- tests/test_envs.py | 6 +++--- tests/test_predict.py | 4 ++-- tests/test_run.py | 14 +++++++------- tests/test_save_load.py | 4 ++-- tests/test_sde.py | 2 +- tests/test_spaces.py | 4 ++-- tests/test_tensorboard.py | 4 ++-- tests/test_train_eval_mode.py | 12 ++++++------ tests/test_utils.py | 14 +++++++------- tests/test_vec_normalize.py | 2 +- 22 files changed, 61 insertions(+), 61 deletions(-) diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst index 23cf54aa3..19bccb22c 100644 --- a/docs/guide/callbacks.rst +++ b/docs/guide/callbacks.rst @@ -174,7 +174,7 @@ and optionally a prefix for the checkpoints (``rl_model`` by default). 
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', name_prefix='rl_model') - model = SAC('MlpPolicy', 'pendulum-v1') + model = SAC('MlpPolicy', 'Pendulum-v1') model.learn(2000, callback=checkpoint_callback) @@ -206,13 +206,13 @@ It will save the best model if ``best_model_save_path`` folder is specified and from stable_baselines3.common.callbacks import EvalCallback # Separate evaluation env - eval_env = gym.make('pendulum-v1') + eval_env = gym.make('Pendulum-v1') # Use deterministic actions for evaluation eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=500, deterministic=True, render=False) - model = SAC('MlpPolicy', 'pendulum-v1') + model = SAC('MlpPolicy', 'Pendulum-v1') model.learn(5000, callback=eval_callback) @@ -234,13 +234,13 @@ Alternatively, you can pass directly a list of callbacks to the ``learn()`` meth checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/') # Separate evaluation env - eval_env = gym.make('pendulum-v1') + eval_env = gym.make('Pendulum-v1') eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model', log_path='./logs/results', eval_freq=500) # Create the callback list callback = CallbackList([checkpoint_callback, eval_callback]) - model = SAC('MlpPolicy', 'pendulum-v1') + model = SAC('MlpPolicy', 'Pendulum-v1') # Equivalent to: # model.learn(5000, callback=[checkpoint_callback, eval_callback]) model.learn(5000, callback=callback) @@ -263,12 +263,12 @@ It must be used with the :ref:`EvalCallback` and use the event triggered by a ne from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold # Separate evaluation env - eval_env = gym.make('pendulum-v1') + eval_env = gym.make('Pendulum-v1') # Stop training when the model reaches the reward threshold callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1) eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1) - model = SAC('MlpPolicy', 'pendulum-v1', verbose=1) + model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1) # Almost infinite number of timesteps, but the training will stop # early as soon as the reward threshold is reached model.learn(int(1e10), callback=eval_callback) @@ -299,7 +299,7 @@ An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` t checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/') event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event) - model = PPO('MlpPolicy', 'pendulum-v1', verbose=1) + model = PPO('MlpPolicy', 'Pendulum-v1', verbose=1) model.learn(int(2e4), callback=event_callback) @@ -328,7 +328,7 @@ and in total for ``max_episodes * n_envs`` episodes. 
# Stops training when the model reaches the maximum number of episodes callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1) - model = A2C('MlpPolicy', 'pendulum-v1', verbose=1) + model = A2C('MlpPolicy', 'Pendulum-v1', verbose=1) # Almost infinite number of timesteps, but the training will stop # early as soon as the max number of episodes is reached model.learn(int(1e10), callback=callback_max_episodes) diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst index 94e11166b..d8d355645 100644 --- a/docs/guide/custom_policy.rst +++ b/docs/guide/custom_policy.rst @@ -401,5 +401,5 @@ you only need to specify ``net_arch=[256, 256]`` (here, two hidden layers of 256 # Custom critic architecture with two layers of 400 and 300 units policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[400, 300])) # Create the agent - model = SAC("MlpPolicy", "pendulum-v1", policy_kwargs=policy_kwargs, verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs, verbose=1) model.learn(5000) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 44eb4bb6d..0c439f369 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -536,7 +536,7 @@ Behind the scene, SB3 uses an :ref:`EvalCallback `. # Create the model, the training environment # and the test environment (for evaluation) - model = SAC('MlpPolicy', 'pendulum-v1', verbose=1, + model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1, learning_rate=1e-3, create_eval_env=True) # Evaluate the model every 1000 steps on 5 test episodes diff --git a/docs/guide/export.rst b/docs/guide/export.rst index a32c85c79..e9c673623 100644 --- a/docs/guide/export.rst +++ b/docs/guide/export.rst @@ -62,7 +62,7 @@ For PPO, assuming a shared feature extactor. action_hidden, value_hidden = self.extractor(observation) return self.action_net(action_hidden), self.value_net(value_hidden) - # Example: model = PPO("MlpPolicy", "pendulum-v1") + # Example: model = PPO("MlpPolicy", "Pendulum-v1") model = PPO.load("PathToTrainedModel.zip") model.policy.to("cpu") onnxable_model = OnnxablePolicy(model.policy.mlp_extractor, model.policy.action_net, model.policy.value_net) diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst index 7348fd45d..34f02a7df 100644 --- a/docs/guide/tensorboard.rst +++ b/docs/guide/tensorboard.rst @@ -61,7 +61,7 @@ Here is a simple example on how to log both additional tensor or arbitrary scala from stable_baselines3 import SAC from stable_baselines3.common.callbacks import BaseCallback - model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class TensorboardCallback(BaseCallback): @@ -104,7 +104,7 @@ Here is an example of how to render an image to TensorBoard at regular intervals from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.logger import Image - model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class ImageRecorderCallback(BaseCallback): @@ -141,7 +141,7 @@ Here is an example of how to store a plot in TensorBoard at regular intervals: from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.logger import Figure - model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class 
FigureRecorderCallback(BaseCallback): @@ -251,7 +251,7 @@ can get direct access to the underlying SummaryWriter in a callback: - model = SAC("MlpPolicy", "pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) + model = SAC("MlpPolicy", "Pendulum-v1", tensorboard_log="/tmp/sac/", verbose=1) class SummaryWriterCallback(BaseCallback): diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst index 059da5f05..ace8f991b 100644 --- a/docs/modules/ddpg.rst +++ b/docs/modules/ddpg.rst @@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import DDPG from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - env = gym.make("pendulum-v1") + env = gym.make("Pendulum-v1") # The noise objects for DDPG n_actions = env.action_space.shape[-1] diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst index 48a90783c..eb7dd4d7d 100644 --- a/docs/modules/sac.rst +++ b/docs/modules/sac.rst @@ -73,7 +73,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import SAC - env = gym.make("pendulum-v1") + env = gym.make("Pendulum-v1") model = SAC("MlpPolicy", env, verbose=1) model.learn(total_timesteps=10000, log_interval=4) diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst index 03aafe726..62e36a13b 100644 --- a/docs/modules/td3.rst +++ b/docs/modules/td3.rst @@ -67,7 +67,7 @@ This example is only to demonstrate the use of the library and its functions, an from stable_baselines3 import TD3 from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - env = gym.make("pendulum-v1") + env = gym.make("Pendulum-v1") # The noise objects for TD3 n_actions = env.action_space.shape[-1] diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 34f8cd660..e1f6d3869 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -75,7 +75,7 @@ def test_callbacks(tmp_path, model_class): if model_class in [A2C, PPO]: max_episodes = 1 n_envs = 2 - # pendulum-v1 has a timelimit of 200 timesteps + # Pendulum-v1 has a timelimit of 200 timesteps max_episode_length = 200 envs = make_vec_env(env_name, n_envs=n_envs, seed=0) @@ -99,7 +99,7 @@ def select_env(model_class) -> str: if model_class is DQN: return "CartPole-v0" else: - return "pendulum-v1" + return "Pendulum-v1" def test_eval_callback_vec_env(): diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py index 5221102c5..e2d98fbd5 100644 --- a/tests/test_custom_policy.py +++ b/tests/test_custom_policy.py @@ -25,7 +25,7 @@ def test_flexible_mlp(model_class, net_arch): @pytest.mark.parametrize("net_arch", [[], [4], [4, 4], dict(qf=[8], pi=[8, 4])]) @pytest.mark.parametrize("model_class", [SAC, TD3]) def test_custom_offpolicy(model_class, net_arch): - _ = model_class("MlpPolicy", "pendulum-v1", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300) + _ = model_class("MlpPolicy", "Pendulum-v1", policy_kwargs=dict(net_arch=net_arch), learning_starts=100).learn(300) @pytest.mark.parametrize("model_class", [A2C, PPO, SAC, TD3]) @@ -38,12 +38,12 @@ def test_custom_optimizer(model_class, optimizer_kwargs): kwargs = dict(n_steps=64) policy_kwargs = dict(optimizer_class=th.optim.AdamW, optimizer_kwargs=optimizer_kwargs, net_arch=[32]) - _ = model_class("MlpPolicy", "pendulum-v1", policy_kwargs=policy_kwargs, **kwargs).learn(300) + _ = model_class("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs, **kwargs).learn(300) def 
test_tf_like_rmsprop_optimizer(): policy_kwargs = dict(optimizer_class=RMSpropTFLike, net_arch=[32]) - _ = A2C("MlpPolicy", "pendulum-v1", policy_kwargs=policy_kwargs).learn(500) + _ = A2C("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs).learn(500) def test_dqn_custom_policy(): diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py index e3a4b240c..fd5ab32fc 100644 --- a/tests/test_deterministic.py +++ b/tests/test_deterministic.py @@ -14,7 +14,7 @@ def test_deterministic_training_common(algo): # Smaller network kwargs = {"policy_kwargs": dict(net_arch=[64])} if algo in [TD3, SAC]: - env_id = "pendulum-v1" + env_id = "Pendulum-v1" kwargs.update({"action_noise": NormalActionNoise(0.0, 0.1), "learning_starts": 100}) else: env_id = "CartPole-v1" diff --git a/tests/test_distributions.py b/tests/test_distributions.py index 8397107d5..3652b1850 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -43,7 +43,7 @@ def test_squashed_gaussian(model_class): """ Test run with squashed Gaussian (notably entropy computation) """ - model = model_class("MlpPolicy", "pendulum-v1", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True)) + model = model_class("MlpPolicy", "Pendulum-v1", use_sde=True, n_steps=64, policy_kwargs=dict(squash_output=True)) model.learn(500) gaussian_mean = th.rand(N_SAMPLES, N_ACTIONS) @@ -57,10 +57,10 @@ def test_squashed_gaussian(model_class): @pytest.fixture() def dummy_model_distribution_obs_and_actions() -> Tuple[A2C, np.array, np.array]: """ - Fixture creating a pendulum-v1 gym env, an A2C model and sampling 10 random observations and actions from the env + Fixture creating a Pendulum-v1 gym env, an A2C model and sampling 10 random observations and actions from the env :return: A2C model, random observations, random actions """ - env = gym.make("pendulum-v1") + env = gym.make("Pendulum-v1") model = A2C("MlpPolicy", env, seed=23) random_obs = np.array([env.observation_space.sample() for _ in range(10)]) random_actions = np.array([env.action_space.sample() for _ in range(10)]) diff --git a/tests/test_envs.py b/tests/test_envs.py index b8e075689..d46ddf376 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -27,7 +27,7 @@ ] -@pytest.mark.parametrize("env_id", ["CartPole-v0", "pendulum-v1"]) +@pytest.mark.parametrize("env_id", ["CartPole-v0", "Pendulum-v1"]) def test_env(env_id): """ Check that environmnent integrated in Gym pass the test. 
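For reference, a minimal sketch of the behaviour this test exercises, assuming gym>=0.21 (where the pendulum task is registered as ``Pendulum-v1``) and the ``check_env`` helper from Stable-Baselines3:

.. code-block:: python

    import gym

    from stable_baselines3.common.env_checker import check_env

    # Assumes gym>=0.21, which registers the pendulum task as Pendulum-v1.
    env = gym.make("Pendulum-v1")
    # check_env is expected to warn once here: the action space is [-2, 2]
    # rather than the recommended symmetric [-1, 1] range.
    check_env(env, warn=True)
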
@@ -38,9 +38,9 @@ def test_env(env_id): with pytest.warns(None) as record: check_env(env) - # pendulum-v1 will produce a warning because the action space is + # Pendulum-v1 will produce a warning because the action space is # in [-2, 2] and not [-1, 1] - if env_id == "pendulum-v1": + if env_id == "Pendulum-v1": assert len(record) == 1 else: # The other environments must pass without warning diff --git a/tests/test_predict.py b/tests/test_predict.py index 3b28b1caf..853f4d11d 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -43,7 +43,7 @@ def test_auto_wrap(model_class): if model_class is DQN: env_name = "CartPole-v0" else: - env_name = "pendulum-v1" + env_name = "Pendulum-v1" env = gym.make(env_name) eval_env = gym.make(env_name) model = model_class("MlpPolicy", env) @@ -51,7 +51,7 @@ def test_auto_wrap(model_class): @pytest.mark.parametrize("model_class", MODEL_LIST) -@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) @pytest.mark.parametrize("device", ["cpu", "cuda", "auto"]) def test_predict(model_class, env_id, device): if device == "cuda" and not th.cuda.is_available(): diff --git a/tests/test_run.py b/tests/test_run.py index ae14adb4f..340f2717e 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -15,7 +15,7 @@ def test_deterministic_pg(model_class, action_noise): """ model = model_class( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, @@ -26,13 +26,13 @@ def test_deterministic_pg(model_class, action_noise): model.learn(total_timesteps=300, eval_freq=250) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "pendulum-v1"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"]) def test_a2c(env_id): model = A2C("MlpPolicy", env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) -@pytest.mark.parametrize("env_id", ["CartPole-v1", "pendulum-v1"]) +@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"]) @pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2]) def test_ppo(env_id, clip_range_vf): if clip_range_vf is not None and clip_range_vf < 0: @@ -65,7 +65,7 @@ def test_ppo(env_id, clip_range_vf): def test_sac(ent_coef): model = SAC( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, @@ -82,7 +82,7 @@ def test_n_critics(n_critics): # Test SAC with different number of critics, for TD3, n_critics=1 corresponds to DDPG model = SAC( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=n_critics), learning_starts=100, buffer_size=10000, @@ -110,7 +110,7 @@ def test_train_freq(tmp_path, train_freq): model = SAC( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=1), learning_starts=100, buffer_size=10000, @@ -131,7 +131,7 @@ def test_train_freq_fail(train_freq): with pytest.raises(ValueError): model = SAC( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", policy_kwargs=dict(net_arch=[64, 64], n_critics=1), learning_starts=100, buffer_size=10000, diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 61ef3c571..c380e321b 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -238,7 +238,7 @@ def test_exclude_include_saved_params(tmp_path, model_class): def test_save_load_pytorch_var(tmp_path): - model = SAC("MlpPolicy", "pendulum-v1", 
seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "sac_pendulum") model.save(save_path) @@ -255,7 +255,7 @@ def test_save_load_pytorch_var(tmp_path): assert not th.allclose(log_ent_coef_before, log_ent_coef_after) # With a fixed entropy coef - model = SAC("MlpPolicy", "pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) + model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1)) model.learn(200) save_path = str(tmp_path / "sac_pendulum") model.save(save_path) diff --git a/tests/test_sde.py b/tests/test_sde.py index 6fda390ea..b81d2780c 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -65,7 +65,7 @@ def test_state_dependent_noise(model_class, use_expln): kwargs = {"learning_starts": 0} if model_class == SAC else {"n_steps": 64} model = model_class( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", use_sde=True, seed=None, create_eval_env=True, diff --git a/tests/test_spaces.py b/tests/test_spaces.py index 0e44f4256..875da9526 100644 --- a/tests/test_spaces.py +++ b/tests/test_spaces.py @@ -53,10 +53,10 @@ def test_identity_spaces(model_class, env): @pytest.mark.parametrize("model_class", [A2C, DDPG, DQN, PPO, SAC, TD3]) -@pytest.mark.parametrize("env", ["pendulum-v1", "CartPole-v1"]) +@pytest.mark.parametrize("env", ["Pendulum-v1", "CartPole-v1"]) def test_action_spaces(model_class, env): if model_class in [SAC, DDPG, TD3]: - supported_action_space = env == "pendulum-v1" + supported_action_space = env == "Pendulum-v1" elif model_class == DQN: supported_action_space = env == "CartPole-v1" elif model_class in [A2C, PPO]: diff --git a/tests/test_tensorboard.py b/tests/test_tensorboard.py index 2f9b68087..20f58b912 100644 --- a/tests/test_tensorboard.py +++ b/tests/test_tensorboard.py @@ -7,8 +7,8 @@ MODEL_DICT = { "a2c": (A2C, "CartPole-v1"), "ppo": (PPO, "CartPole-v1"), - "sac": (SAC, "pendulum-v1"), - "td3": (TD3, "pendulum-v1"), + "sac": (SAC, "Pendulum-v1"), + "td3": (TD3, "Pendulum-v1"), } N_STEPS = 100 diff --git a/tests/test_train_eval_mode.py b/tests/test_train_eval_mode.py index d26fe5661..1ea2efe67 100644 --- a/tests/test_train_eval_mode.py +++ b/tests/test_train_eval_mode.py @@ -172,7 +172,7 @@ def test_dqn_train_with_batch_norm(): def test_td3_train_with_batch_norm(): model = TD3( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor), learning_starts=0, tau=0, # do not copy the target @@ -219,7 +219,7 @@ def test_td3_train_with_batch_norm(): def test_sac_train_with_batch_norm(): model = SAC( "MlpPolicy", - "pendulum-v1", + "Pendulum-v1", policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor), learning_starts=0, tau=0, # do not copy the target @@ -257,7 +257,7 @@ def test_sac_train_with_batch_norm(): @pytest.mark.parametrize("model_class", [A2C, PPO]) -@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) def test_a2c_ppo_train_with_batch_norm(model_class, env_id): model = model_class( "MlpPolicy", @@ -281,7 +281,7 @@ def test_offpolicy_collect_rollout_batch_norm(model_class): if model_class in [DQN]: env_id = "CartPole-v1" else: - env_id = "pendulum-v1" + env_id = "Pendulum-v1" clone_helper = CLONE_HELPERS[model_class] @@ -308,7 +308,7 
@@ def test_offpolicy_collect_rollout_batch_norm(model_class): @pytest.mark.parametrize("model_class", [A2C, PPO]) -@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id): model = model_class( "MlpPolicy", @@ -332,7 +332,7 @@ def test_a2c_ppo_collect_rollouts_with_batch_norm(model_class, env_id): @pytest.mark.parametrize("model_class", MODEL_LIST) -@pytest.mark.parametrize("env_id", ["pendulum-v1", "CartPole-v1"]) +@pytest.mark.parametrize("env_id", ["Pendulum-v1", "CartPole-v1"]) def test_predict_with_dropout_batch_norm(model_class, env_id): if env_id == "CartPole-v1": if model_class in [SAC, TD3]: diff --git a/tests/test_utils.py b/tests/test_utils.py index d5fcf599a..9f16a5c7b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -96,7 +96,7 @@ def test_vec_env_monitor_kwargs(): def test_env_auto_monitor_wrap(): - env = gym.make("pendulum-v1") + env = gym.make("Pendulum-v1") model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True @@ -104,7 +104,7 @@ def test_env_auto_monitor_wrap(): model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True - model = A2C("MlpPolicy", "pendulum-v1") + model = A2C("MlpPolicy", "Pendulum-v1") assert model.env.env_is_wrapped(Monitor)[0] is True @@ -136,7 +136,7 @@ def test_custom_vec_env(tmp_path): def test_evaluate_policy(): - model = A2C("MlpPolicy", "pendulum-v1", seed=0) + model = A2C("MlpPolicy", "Pendulum-v1", seed=0) n_steps_per_episode, n_eval_episodes = 200, 2 model.n_callback_calls = 0 @@ -166,7 +166,7 @@ def dummy_callback(locals_, _globals): assert len(episode_rewards) == n_eval_episodes # Test that warning is given about no monitor - eval_env = gym.make("pendulum-v1") + eval_env = gym.make("Pendulum-v1") with pytest.warns(UserWarning): _ = evaluate_policy(model, eval_env, n_eval_episodes) @@ -355,7 +355,7 @@ def test_zip_strict(): def test_is_wrapped(): """Test that is_wrapped correctly detects wraps""" - env = gym.make("pendulum-v1") + env = gym.make("Pendulum-v1") env = gym.Wrapper(env) assert not is_wrapped(env, Monitor) monitor_env = Monitor(env) @@ -372,11 +372,11 @@ def test_ppo_warnings(): # Only 1 step: advantage normalization will return NaN with pytest.raises(AssertionError): - PPO("MlpPolicy", "pendulum-v1", n_steps=1) + PPO("MlpPolicy", "Pendulum-v1", n_steps=1) # Truncated mini-batch with pytest.warns(UserWarning): - PPO("MlpPolicy", "pendulum-v1", n_steps=6, batch_size=8) + PPO("MlpPolicy", "Pendulum-v1", n_steps=6, batch_size=8) def test_get_system_info(): diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index ca6b2ca47..5e18c3a98 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -14,7 +14,7 @@ unwrap_vec_normalize, ) -ENV_ID = "pendulum-v1" +ENV_ID = "Pendulum-v1" class DummyRewardEnv(gym.Env): From f549fc819f9b03e940eed76075cba4f3fa47bfef Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 09:46:19 -0400 Subject: [PATCH 10/25] add minimum version --- setup.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/setup.py b/setup.py index 393de02cf..e85eff4fc 100644 --- a/setup.py +++ b/setup.py @@ -134,6 +134,13 @@ long_description=long_description, long_description_content_type="text/markdown", version=__version__, + python_requires=">=3.7", + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming 
Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + ], ) # python setup.py sdist From 1db85d134a67ab6294e214aa5d08f3d84c797965 Mon Sep 17 00:00:00 2001 From: J K Terry Date: Thu, 21 Oct 2021 09:51:59 -0400 Subject: [PATCH 11/25] moved things in changelog to breaking changes --- docs/misc/changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 74e7af7dd..617cb009f 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -9,6 +9,7 @@ Release 1.2.1a4 (WIP) Breaking Changes: ^^^^^^^^^^^^^^^^^ +- Support for Python 3.6 was removed - ``sde_net_arch`` argument in policies is deprecated and will be removed in a future version. - ``_get_latent`` (``ActorCriticPolicy``) was removed - All logging keys now use underscores instead of spaces (@timokau). Concretely this changes: @@ -35,7 +36,6 @@ Bug Fixes: Deprecations: ^^^^^^^^^^^^^ -- Support for Python 3.6 was removed - Switched minimum Gym version to 0.21.0 Others: From d72cdf61c37b94aa601b6d57ed15680bb369185d Mon Sep 17 00:00:00 2001 From: J K Terry Date: Sat, 23 Oct 2021 17:07:31 -0400 Subject: [PATCH 12/25] partial v5 fix --- tests/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9f16a5c7b..661b28e8a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -38,11 +38,11 @@ def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class): env.close() -@pytest.mark.parametrize("env_id", ["BreakoutNoFrameskip-v5"]) +@pytest.mark.parametrize("env_id", ["ALE/Breakout-v5"]) @pytest.mark.parametrize("n_envs", [1, 2]) @pytest.mark.parametrize("wrapper_kwargs", [None, dict(clip_reward=False, screen_size=60)]) def test_make_atari_env(env_id, n_envs, wrapper_kwargs): - env_id = "BreakoutNoFrameskip-v5" + env_id = "ALE/Breakout-v5" env = make_atari_env(env_id, n_envs, wrapper_kwargs=wrapper_kwargs, monitor_dir=None, seed=0) assert env.num_envs == n_envs From 55414c344b81660a6ab779f17261535a8d8c7c95 Mon Sep 17 00:00:00 2001 From: modanesh Date: Tue, 28 Dec 2021 13:42:55 +0330 Subject: [PATCH 13/25] env update to pass tests --- tests/test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_run.py b/tests/test_run.py index 5e6a107fe..223776dfb 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -147,7 +147,7 @@ def test_train_freq_fail(train_freq): def test_offpolicy_multi_env(model_class): kwargs = {} if model_class in [SAC, TD3, DDPG]: - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" policy_kwargs = dict(net_arch=[64], n_critics=1) # Check auto-conversion to VectorizedActionNoise kwargs = dict(action_noise=NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))) From 319ce24d4528af1a7b87871d1c9e926e29b54d77 Mon Sep 17 00:00:00 2001 From: modanesh Date: Tue, 28 Dec 2021 14:52:35 +0330 Subject: [PATCH 14/25] mismatch env version fixed --- tests/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 3cb7e2fc3..9dc35e07f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -81,14 +81,14 @@ def test_vec_env_monitor_kwargs(): env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False}) assert env.get_attr("allow_early_resets")[0] is False - env = make_atari_env("BreakoutNoFrameskip-v5", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False}) + env = make_atari_env("BreakoutNoFrameskip-v4", 
n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False}) assert env.get_attr("allow_early_resets")[0] is False env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": True}) assert env.get_attr("allow_early_resets")[0] is True env = make_atari_env( - "BreakoutNoFrameskip-v5", + "BreakoutNoFrameskip-v4", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": True}, From 218bc1a5fc72d1173f8bc2e61b5c27dcf2409fd9 Mon Sep 17 00:00:00 2001 From: Carlos Luis Date: Sat, 22 Jan 2022 17:50:02 +0100 Subject: [PATCH 15/25] Fix tests after merge --- tests/test_deterministic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py index 9b4baa73a..4c92d269f 100644 --- a/tests/test_deterministic.py +++ b/tests/test_deterministic.py @@ -13,9 +13,8 @@ def test_deterministic_training_common(algo): rewards = [[], []] # Smaller network kwargs = {"policy_kwargs": dict(net_arch=[64])} - env_id = "Pendulum-v0" + env_id = "Pendulum-v1" if algo in [TD3, SAC]: - env_id = "Pendulum-v1" kwargs.update({"action_noise": NormalActionNoise(0.0, 0.1), "learning_starts": 100, "train_freq": 4}) else: if algo == DQN: From e5f70126482fb566511fbd1ed6e99787a018572c Mon Sep 17 00:00:00 2001 From: Carlos Luis Date: Sat, 22 Jan 2022 17:57:33 +0100 Subject: [PATCH 16/25] Include autorom in setup.py --- .github/workflows/ci.yml | 3 --- setup.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7d565a325..b45ae3192 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,9 +32,6 @@ jobs: pip install .[extra,tests,docs] # Use headless version pip install opencv-python-headless - # Add Atari ROMs - pip install AutoROM - AutoROM -v - name: Build the doc run: | make doc diff --git a/setup.py b/setup.py index d6ac4bc9b..d727bc9c1 100644 --- a/setup.py +++ b/setup.py @@ -116,7 +116,8 @@ # For render "opencv-python", # For atari games, - "ale-py~=0.7", + "ale-py~=0.7.1", + "autorom[accept-rom-license]~=0.4.2", "pillow", # Tensorboard support "tensorboard>=2.2.0", From 7bde14c4de81c0912aaec9fa1f4cdfc590c8b7dc Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Thu, 3 Feb 2022 15:31:34 -0800 Subject: [PATCH 17/25] Blacken code --- stable_baselines3/common/distributions.py | 4 ++-- stable_baselines3/common/envs/bit_flipping_env.py | 8 ++++---- tests/test_sde.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/stable_baselines3/common/distributions.py b/stable_baselines3/common/distributions.py index ca3f0b338..1c0e54a88 100644 --- a/stable_baselines3/common/distributions.py +++ b/stable_baselines3/common/distributions.py @@ -222,7 +222,7 @@ def log_prob(self, actions: th.Tensor, gaussian_actions: Optional[th.Tensor] = N log_prob = super(SquashedDiagGaussianDistribution, self).log_prob(gaussian_actions) # Squash correction (from original SAC implementation) # this comes from the fact that tanh is bijective and differentiable - log_prob -= th.sum(th.log(1 - actions ** 2 + self.epsilon), dim=1) + log_prob -= th.sum(th.log(1 - actions**2 + self.epsilon), dim=1) return log_prob def entropy(self) -> Optional[th.Tensor]: @@ -531,7 +531,7 @@ def proba_distribution( """ # Stop gradient if we don't want to influence the features self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() - variance = th.mm(self._latent_sde ** 2, self.get_std(log_std) ** 2) + variance = th.mm(self._latent_sde**2, 
self.get_std(log_std) ** 2) self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon)) return self diff --git a/stable_baselines3/common/envs/bit_flipping_env.py b/stable_baselines3/common/envs/bit_flipping_env.py index f5c2fb4d3..c5d713aa2 100644 --- a/stable_baselines3/common/envs/bit_flipping_env.py +++ b/stable_baselines3/common/envs/bit_flipping_env.py @@ -46,9 +46,9 @@ def __init__( # representation of the observation self.observation_space = spaces.Dict( { - "observation": spaces.Discrete(2 ** n_bits), - "achieved_goal": spaces.Discrete(2 ** n_bits), - "desired_goal": spaces.Discrete(2 ** n_bits), + "observation": spaces.Discrete(2**n_bits), + "achieved_goal": spaces.Discrete(2**n_bits), + "desired_goal": spaces.Discrete(2**n_bits), } ) elif image_obs_space: @@ -115,7 +115,7 @@ def convert_if_needed(self, state: np.ndarray) -> Union[int, np.ndarray]: if self.discrete_obs_space: # The internal state is the binary representation of the # observed one - return int(sum([state[i] * 2 ** i for i in range(len(state))])) + return int(sum([state[i] * 2**i for i in range(len(state))])) if self.image_obs_space: size = np.prod(self.image_shape) diff --git a/tests/test_sde.py b/tests/test_sde.py index e20b01d74..17ac1501d 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -26,7 +26,7 @@ def test_state_dependent_exploration_grad(): action = mu + noise - variance = th.mm(state ** 2, sigma_hat ** 2) + variance = th.mm(state**2, sigma_hat**2) action_dist = Normal(mu, th.sqrt(variance)) # Sum over the action dimension because we assume they are independent @@ -44,7 +44,7 @@ def test_state_dependent_exploration_grad(): for i in range(state_dim): # Derivative of the log probability of the jth component of the action # w.r.t. the standard deviation sigma_j - d_log_policy_j = (noise[:, j] ** 2 - sigma_j ** 2) / sigma_j ** 3 + d_log_policy_j = (noise[:, j] ** 2 - sigma_j**2) / sigma_j**3 # Derivative of sigma_j w.r.t. 
sigma_hat_ij d_log_sigma_j = (state[:, i] ** 2 * sigma_hat[i, j]) / sigma_j # Chain rule, average over the minibatch From f6414e7dd10f5046374123eaf11a070fff6a4749 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Thu, 3 Feb 2022 15:54:49 -0800 Subject: [PATCH 18/25] Fix dtype issue in more robust way --- tests/test_env_checker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_env_checker.py b/tests/test_env_checker.py index 776e774f7..0b0a82d8f 100644 --- a/tests/test_env_checker.py +++ b/tests/test_env_checker.py @@ -8,17 +8,17 @@ class ActionDictTestEnv(gym.Env): action_space = Dict({"position": Discrete(1), "velocity": Discrete(1)}) - observation_space = Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float64) + observation_space = Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float32) def step(self, action): - observation = np.array([1.0, 1.5, 0.5]) + observation = np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype) reward = 1 done = True info = {} return observation, reward, done, info def reset(self): - return np.array([1.0, 1.5, 0.5]) + return np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype) def render(self, mode="human"): pass From f4b334248c73cba4609c51b8cdd6dc782423dd4e Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Thu, 3 Feb 2022 16:05:37 -0800 Subject: [PATCH 19/25] Fix GitLab CI: switch to Docker container with new black version --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 73f0134c9..63f9eafa0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: stablebaselines/stable-baselines3-cpu:1.3.1a3 +image: stablebaselines/stable-baselines3-cpu:1.4.1a0 type-check: script: From 7f1e99e7a45dab97a8006dabc4f04fad6ad78da8 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Thu, 3 Feb 2022 16:11:41 -0800 Subject: [PATCH 20/25] Remove workaround from GitLab. (May need to rebuild Docker for this though.) --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 73f0134c9..c54317406 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,8 +7,6 @@ type-check: pytest: script: - python --version - # Fix to get atari ROMs - - pip install atari-py==0.2.5 # MKL_THREADING_LAYER=GNU to avoid MKL_THREADING_LAYER=INTEL incompatibility error - MKL_THREADING_LAYER=GNU make pytest From ea073aef69bf5b4334dee1455258839d775d3c8e Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Fri, 4 Feb 2022 14:27:53 -0800 Subject: [PATCH 21/25] Revert to v4 --- docs/guide/custom_policy.rst | 2 +- docs/guide/examples.rst | 2 +- docs/modules/dqn.rst | 2 +- tests/test_utils.py | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst index 2b734937e..1b8f9fb7f 100644 --- a/docs/guide/custom_policy.rst +++ b/docs/guide/custom_policy.rst @@ -145,7 +145,7 @@ that derives from ``BaseFeaturesExtractor`` and then pass it to the model when t features_extractor_class=CustomCNN, features_extractor_kwargs=dict(features_dim=128), ) - model = PPO("CnnPolicy", "BreakoutNoFrameskip-v5", policy_kwargs=policy_kwargs, verbose=1) + model = PPO("CnnPolicy", "BreakoutNoFrameskip-v4", policy_kwargs=policy_kwargs, verbose=1) model.learn(1000) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index b42eb340f..08d9caae7 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -336,7 +336,7 @@ and multiprocessing for you. 
To install the Atari environments, run the command` # There already exists an environment generator # that will make and wrap atari environments correctly. # Here we are also multi-worker training (n_envs=4 => 4 environments) - env = make_atari_env('PongNoFrameskip-v5', n_envs=4, seed=0) + env = make_atari_env('PongNoFrameskip-v4', n_envs=4, seed=0) # Frame-stacking with 4 frames env = VecFrameStack(env, n_stack=4) diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst index 68925d6c8..ce4385502 100644 --- a/docs/modules/dqn.rst +++ b/docs/modules/dqn.rst @@ -99,7 +99,7 @@ Clone the `rl-zoo repo `_: cd rl-baselines3-zoo/ -Run the benchmark (replace ``$ENV_ID`` by the env id, for instance ``BreakoutNoFrameskip-v5``): +Run the benchmark (replace ``$ENV_ID`` by the env id, for instance ``BreakoutNoFrameskip-v4``): .. code-block:: bash diff --git a/tests/test_utils.py b/tests/test_utils.py index 9dc35e07f..b07bbe931 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -39,11 +39,10 @@ def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class): env.close() -@pytest.mark.parametrize("env_id", ["ALE/Breakout-v5"]) +@pytest.mark.parametrize("env_id", ["BreakoutNoFrameskip-v4"]) @pytest.mark.parametrize("n_envs", [1, 2]) @pytest.mark.parametrize("wrapper_kwargs", [None, dict(clip_reward=False, screen_size=60)]) def test_make_atari_env(env_id, n_envs, wrapper_kwargs): - env_id = "ALE/Breakout-v5" env = make_atari_env(env_id, n_envs, wrapper_kwargs=wrapper_kwargs, monitor_dir=None, seed=0) assert env.num_envs == n_envs From 8f7d26bb17d4bb9ec2b1a431815f9f3c43a1beba Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Fri, 4 Feb 2022 14:28:04 -0800 Subject: [PATCH 22/25] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d727bc9c1..2b0cdb0b0 100644 --- a/setup.py +++ b/setup.py @@ -116,7 +116,7 @@ # For render "opencv-python", # For atari games, - "ale-py~=0.7.1", + "gym[atari,accept-rom-license]>=0.21.0", "autorom[accept-rom-license]~=0.4.2", "pillow", # Tensorboard support From edb504a3c2673afbb264ba7e2a7be8b0eb0798c4 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Fri, 4 Feb 2022 14:29:22 -0800 Subject: [PATCH 23/25] Apply suggestions from code review --- docs/guide/examples.rst | 2 +- docs/misc/changelog.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 08d9caae7..a5b56b249 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -321,7 +321,7 @@ Atari Games Training a RL agent on Atari games is straightforward thanks to ``make_atari_env`` helper function. It will do `all the preprocessing `_ -and multiprocessing for you. To install the Atari environments, run the command``pip install gym[atari, accept-rom-license]`` to install the Atari environments and ROMs. +and multiprocessing for you. To install the Atari environments, run the command ``pip install gym[atari, accept-rom-license]`` to install the Atari environments and ROMs, or install Stable Baselines3 with ``pip install stable-baselines3[extra]`` to install this and other optional dependencies. .. 
image:: ../_static/img/colab-badge.svg :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/atari_games.ipynb diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 9a0728208..6c636d3e3 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -118,7 +118,7 @@ Release 1.3.0 (2021-10-23) Breaking Changes: ^^^^^^^^^^^^^^^^^ -- Support for Python 3.6 was removed +- Support for Python 3.6 was removed. - ``sde_net_arch`` argument in policies is deprecated and will be removed in a future version. - ``_get_latent`` (``ActorCriticPolicy``) was removed - All logging keys now use underscores instead of spaces (@timokau). Concretely this changes: @@ -146,7 +146,7 @@ Bug Fixes: Deprecations: ^^^^^^^^^^^^^ -- Switched minimum Gym version to 0.21.0 +- Switched minimum Gym version to 0.21.0. Others: ^^^^^^^ From f34ea24f28ba8b1b13e6bb68460e97090457f454 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Fri, 4 Feb 2022 14:31:33 -0800 Subject: [PATCH 24/25] Remove unnecessary autorom --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 2b0cdb0b0..e3dd72e12 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,6 @@ "opencv-python", # For atari games, "gym[atari,accept-rom-license]>=0.21.0", - "autorom[accept-rom-license]~=0.4.2", "pillow", # Tensorboard support "tensorboard>=2.2.0", From d7de342d870ec954dfd205f640c7b6c392deff39 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Fri, 4 Feb 2022 14:33:06 -0800 Subject: [PATCH 25/25] Consistent gym versions --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e3dd72e12..eabf30c66 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ packages=[package for package in find_packages() if package.startswith("stable_baselines3")], package_data={"stable_baselines3": ["py.typed", "version.txt"]}, install_requires=[ - "gym>=0.21", + "gym>=0.21", # Remember to also update gym version in "extra" below when this changes "numpy", "torch>=1.8.1", # For saving models @@ -116,7 +116,7 @@ # For render "opencv-python", # For atari games, - "gym[atari,accept-rom-license]>=0.21.0", + "gym[atari,accept-rom-license]>=0.21", "pillow", # Tensorboard support "tensorboard>=2.2.0",
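
Taken together, the patches above move the Atari dependency from ``atari-py`` to the ``gym[atari,accept-rom-license]`` extra, bump the minimum Gym to 0.21, and revert the documented environment ids to the ``NoFrameskip-v4`` variants. As an illustrative smoke test only (not part of the patch series), the following sketch uses ``make_atari_env``, ``VecFrameStack``, and ``PPO`` exactly as they appear in the docs and tests touched above, and assumes the ``extra`` dependencies from the final ``setup.py`` (or a manual ``pip install gym[atari,accept-rom-license]``) are installed so the ROMs resolve::

    # Hypothetical local check that the new Atari install path works;
    # environment id and helpers are the ones used in the patched docs/tests.
    from stable_baselines3 import PPO
    from stable_baselines3.common.env_util import make_atari_env
    from stable_baselines3.common.vec_env import VecFrameStack

    # make_atari_env applies the standard Atari preprocessing wrappers
    # and returns a vectorized env (2 workers here).
    env = make_atari_env("BreakoutNoFrameskip-v4", n_envs=2, seed=0)
    env = VecFrameStack(env, n_stack=4)  # frame-stacking, as in examples.rst

    model = PPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=500)  # a few hundred steps suffice to confirm the ROMs load

If the ROM license extra is missing, ``make_atari_env`` fails at environment creation, which is the failure mode the CI and ``setup.py`` changes in this series are meant to prevent.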