From 09ca7b6b19b2770f0b0d35246edd76b3aa40c7de Mon Sep 17 00:00:00 2001 From: Ariel Kwiatkowski Date: Tue, 4 Jul 2023 05:21:02 +0200 Subject: [PATCH 1/4] Pin SB3 version to 1.7.0 (#738) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 73ffd00ac..fa1d03f31 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ "autorom[accept-rom-license]~=0.6.0", ] PYTYPE = ["pytype==2022.7.26"] if IS_NOT_WINDOWS else [] -STABLE_BASELINES3 = "stable-baselines3>=1.7.0" +STABLE_BASELINES3 = "stable-baselines3>=1.7.0,<2.0.0" # pinned to 0.21 until https://github.com/DLR-RM/stable-baselines3/pull/780 goes # upstream. GYM_VERSION_SPECIFIER = "==0.21.0" From 35e9d3bf07a491bbcbe90a1f256a0a21958d1f5d Mon Sep 17 00:00:00 2001 From: Ariel Kwiatkowski Date: Wed, 5 Jul 2023 03:56:31 +0200 Subject: [PATCH 2/4] Update conftest.py (#742) --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 10630fe45..6f278499e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,4 +45,4 @@ def custom_logger(tmpdir: str) -> logger.HierarchicalLogger: @pytest.fixture() def rng() -> np.random.Generator: - return np.random.default_rng() + return np.random.default_rng(seed=0) From 90b6aa32c87c991743edc65623b867c7c4b1bba1 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Tue, 4 Jul 2023 19:30:14 -0700 Subject: [PATCH 3/4] Custom environment tutorial (#746) * Custom environment tutorial draft * Update the docs website * Clean notebook * Text clarification and new environment * Decrease training duration to hopefully make CI happy * Clarify that BC itself does not learn rewards --------- Co-authored-by: Ariel Kwiatkowski --- docs/index.rst | 1 + docs/tutorials/8_train_custom_env.ipynb | 366 ++++++++++++++++++++++++ 2 files changed, 367 insertions(+) create mode 100644 docs/tutorials/8_train_custom_env.ipynb diff --git a/docs/index.rst b/docs/index.rst index f3ec53b03..618f9d038 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -74,6 +74,7 @@ If you use ``imitation`` in your research project, please cite our paper to help tutorials/5a_train_preference_comparisons_with_cnn tutorials/6_train_mce tutorials/7_train_density + tutorials/8_train_custom_env tutorials/trajectories .. toctree:: diff --git a/docs/tutorials/8_train_custom_env.ipynb b/docs/tutorials/8_train_custom_env.ipynb new file mode 100644 index 000000000..6c9e28726 --- /dev/null +++ b/docs/tutorials/8_train_custom_env.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[download this notebook here](https://github.com/HumanCompatibleAI/imitation/blob/master/docs/tutorials/8_train_custom_env.ipynb)\n", + "# Train Behavior Cloning in a Custom Environment\n", + "\n", + "You can use `imitation` to train a policy (and, for many imitation learning algorithm, learn rewards) in a custom environment.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Define the environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use a simple ObservationMatching environment as an example. The premise is simple -- the agent receives a vector of observations, and must output a vector of actions that matches the observations as closely as possible.\n", + "\n", + "If you have your own environment that you'd like to use, you can replace the code below with your own environment. 
Make sure it complies with the standard Gym API, and that the observation and action spaces are specified correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import gym\n", + "\n", + "from gym.spaces import Box\n", + "from gym.utils import seeding\n", + "\n", + "\n", + "class ObservationMatchingEnv(gym.Env):\n", + " def __init__(self, num_options: int = 2):\n", + " self.num_options = num_options\n", + " self.observation_space = Box(0, 1, shape=(num_options,), dtype=np.float32)\n", + " self.action_space = Box(0, 1, shape=(num_options,), dtype=np.float32)\n", + " self.seed()\n", + "\n", + " def seed(self, seed=None):\n", + " self.np_random, seed = seeding.np_random(seed)\n", + " return [seed]\n", + "\n", + " def reset(self):\n", + " self.state = self.np_random.uniform(size=self.num_options)\n", + " return self.state\n", + "\n", + " def step(self, action):\n", + " reward = -np.abs(self.state - action).mean()\n", + " self.state = self.np_random.uniform(size=self.num_options)\n", + " return self.state, reward, False, {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 2: create the environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From here, we have two options:\n", + "- Add the environment to the gym registry, and use it with existing utilities (e.g. `make`)\n", + "- Use the environment directly\n", + "\n", + "You only need to execute the cells in step 2a, or step 2b to proceed.\n", + "\n", + "At the end of these steps, we want to have:\n", + "- `env`: a single environment that we can use for training an expert with SB3\n", + "- `venv`: a vectorized environment where each individual environment is wrapped in `RolloutInfoWrapper`, that we can use for collecting rollouts with `imitation`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2a (recommended): add the environment to the gym registry" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The standard approach is adding the environment to the gym registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gym.register(\n", + " id=\"custom/ObservationMatching-v0\",\n", + " entry_point=ObservationMatchingEnv, # This can also be the path to the class, e.g. 
`observation_matching:ObservationMatchingEnv`\n", + " max_episode_steps=500,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After registering, you can create an environment is `gym.make(env_id)` which automatically handles the `TimeLimit` wrapper.\n", + "\n", + "To create a vectorized env, you can use the `make_vec_env` helper function (Option A), or create it directly (Options B1 and B2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gym.wrappers import TimeLimit\n", + "from imitation.data import rollout\n", + "from imitation.data.wrappers import RolloutInfoWrapper\n", + "from imitation.util.util import make_vec_env\n", + "from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv\n", + "\n", + "# Create a single environment for training an expert with SB3\n", + "env = gym.make(\"custom/ObservationMatching-v0\")\n", + "\n", + "\n", + "# Create a vectorized environment for training with `imitation`\n", + "\n", + "# Option A: use the `make_vec_env` helper function - make sure to pass `post_wrappers=[lambda env, _: RolloutInfoWrapper(env)]`\n", + "venv = make_vec_env(\n", + " \"custom/ObservationMatching-v0\",\n", + " rng=np.random.default_rng(),\n", + " n_envs=4,\n", + " post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],\n", + ")\n", + "\n", + "\n", + "# Option B1: use a custom env creator, and create VecEnv directly\n", + "# def _make_env():\n", + "# \"\"\"Helper function to create a single environment. Put any logic here, but make sure to return a RolloutInfoWrapper.\"\"\"\n", + "# _env = gym.make(\"custom/ObservationMatching-v0\")\n", + "# _env = RolloutInfoWrapper(_env)\n", + "# return _env\n", + "#\n", + "# venv = DummyVecEnv([_make_env for _ in range(4)])\n", + "#\n", + "# # Option B2: we can also use a parallel VecEnv implementation\n", + "# venv = SubprocVecEnv([_make_env for _ in range(4)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Step 2b: directly use the environment\n", + "\n", + "Alternatively, we can directly initialize the environment by instantiating the class we created earlier, and handle all the additional logic ourselves." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gym.wrappers import TimeLimit\n", + "from imitation.data import rollout\n", + "from imitation.data.wrappers import RolloutInfoWrapper\n", + "from stable_baselines3.common.vec_env import DummyVecEnv\n", + "import numpy as np\n", + "\n", + "# Create a single environment for training with SB3\n", + "env = ObservationMatchingEnv()\n", + "env = TimeLimit(env, max_episode_steps=500)\n", + "\n", + "# Create a vectorized environment for training with `imitation`\n", + "\n", + "\n", + "# Option A: use a helper function to create multiple environments\n", + "def _make_env():\n", + " \"\"\"Helper function to create a single environment. 
Put any logic here, but make sure to return a RolloutInfoWrapper.\"\"\"\n", + " _env = ObservationMatchingEnv()\n", + " _env = TimeLimit(_env, max_episode_steps=500)\n", + " _env = RolloutInfoWrapper(_env)\n", + " return _env\n", + "\n", + "\n", + "venv = DummyVecEnv([_make_env for _ in range(4)])\n", + "\n", + "\n", + "# Option B: use a single environment\n", + "# env = FixedHorizonCartPoleEnv()\n", + "# venv = DummyVecEnv([lambda: RolloutInfoWrapper(env)]) # Wrap a single environment -- only useful for simple testing like this\n", + "\n", + "# Option C: use multiple environments\n", + "# venv = DummyVecEnv([lambda: RolloutInfoWrapper(ObservationMatchingEnv()) for _ in range(4)]) # Wrap multiple environments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now we're just about done! Whether you used step 2a or 2b, your environment should now be ready to use with SB3 and `imitation`.\n", + "\n", + "For the sake of completeness, we'll train a BC model, the same way as in the first tutorial, but with our custom environment.\n", + "\n", + "Keep in mind that while we're using BC in this tutorial, you can just as easily use any of the other algorithms with the environment prepared in this way." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from stable_baselines3 import PPO\n", + "from stable_baselines3.ppo import MlpPolicy\n", + "from stable_baselines3.common.evaluation import evaluate_policy\n", + "from gym.wrappers import TimeLimit\n", + "\n", + "expert = PPO(\n", + " policy=MlpPolicy,\n", + " env=env,\n", + " seed=0,\n", + " batch_size=64,\n", + " ent_coef=0.0,\n", + " learning_rate=0.0003,\n", + " n_epochs=10,\n", + " n_steps=64,\n", + ")\n", + "\n", + "reward, _ = evaluate_policy(expert, env, 10)\n", + "print(f\"Reward before training: {reward}\")\n", + "\n", + "\n", + "# Note: if you followed step 2a, i.e. 
registered the environment, you can use the environment name directly\n", + "\n", + "# expert = PPO(\n", + "# policy=MlpPolicy,\n", + "# env=\"custom/ObservationMatching-v0\",\n", + "# seed=0,\n", + "# batch_size=64,\n", + "# ent_coef=0.0,\n", + "# learning_rate=0.0003,\n", + "# n_epochs=10,\n", + "# n_steps=64,\n", + "# )\n", + "expert.learn(10_000) # Note: set to 100000 to train a proficient expert\n", + "\n", + "reward, _ = evaluate_policy(expert, env, 10)\n", + "print(f\"Expert reward: {reward}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rng = np.random.default_rng()\n", + "rollouts = rollout.rollout(\n", + " expert,\n", + " venv,\n", + " rollout.make_sample_until(min_timesteps=None, min_episodes=50),\n", + " rng=rng,\n", + ")\n", + "transitions = rollout.flatten_trajectories(rollouts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from imitation.algorithms import bc\n", + "\n", + "bc_trainer = bc.BC(\n", + " observation_space=env.observation_space,\n", + " action_space=env.action_space,\n", + " demonstrations=transitions,\n", + " rng=rng,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As before, the untrained policy only gets poor rewards:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reward_before_training, _ = evaluate_policy(bc_trainer.policy, env, 10)\n", + "print(f\"Reward before training: {reward_before_training}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After training, we can get much closer to the expert's performance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bc_trainer.train(n_epochs=1)\n", + "reward_after_training, _ = evaluate_policy(bc_trainer.policy, env, 10)\n", + "print(f\"Reward after training: {reward_after_training}\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "bd378ce8f53beae712f05342da42c6a7612fc68b19bea03b52c7b1cdc8851b5f" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 688e16357f5d67950185f816ee49d65acbc7c77f Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Wed, 5 Jul 2023 09:26:37 -0700 Subject: [PATCH 4/4] Tutorial on comparing algorithm performance (#747) * Add a new tutorial * Update index.rst * Improvements to the tutorial * Some more caution words * Fix typos --------- Co-authored-by: Ariel Kwiatkowski --- docs/index.rst | 1 + docs/tutorials/9_compare_baselines.ipynb | 481 +++++++++++++++++++++++ 2 files changed, 482 insertions(+) create mode 100644 docs/tutorials/9_compare_baselines.ipynb diff --git a/docs/index.rst b/docs/index.rst index 618f9d038..0c516c58b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -75,6 +75,7 @@ If you use ``imitation`` in your research project, please cite our paper to help tutorials/6_train_mce tutorials/7_train_density tutorials/8_train_custom_env + tutorials/9_compare_baselines tutorials/trajectories .. 
toctree:: diff --git a/docs/tutorials/9_compare_baselines.ipynb b/docs/tutorials/9_compare_baselines.ipynb new file mode 100644 index 000000000..c9bc0481a --- /dev/null +++ b/docs/tutorials/9_compare_baselines.ipynb @@ -0,0 +1,481 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[download this notebook here](https://github.com/HumanCompatibleAI/imitation/blob/master/docs/tutorials/9_compare_baselines.ipynb)\n", + "# Reliably compare algorithm performance\n", + "\n", + "Did we actually match the expert performance or was it just luck? Did this hyperparameter change actually improve the performance of our algorithm? These are questions that we need to answer when we want to compare the performance of different algorithms or hyperparameters.\n", + "\n", + "`imitation` provides some tools to help you answer these questions. For demonstration purposes, we will use Behavior Cloning on the CartPole-v1 environment. We will compare different variants of the trained algorithm, and also compare it with a more sophisticated algorithm, DAgger." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As in the first tutorial, we will start by training an expert." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gym\n", + "from stable_baselines3 import PPO\n", + "from stable_baselines3.ppo import MlpPolicy\n", + "\n", + "env = gym.make(\"CartPole-v1\")\n", + "expert = PPO(\n", + " policy=MlpPolicy,\n", + " env=env,\n", + " seed=0,\n", + " batch_size=64,\n", + " ent_coef=0.0,\n", + " learning_rate=0.0003,\n", + " n_epochs=10,\n", + " n_steps=64,\n", + ")\n", + "expert.learn(10_000) # set to 100_000 for better performance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For comparison, let's also train a not-quite-expert." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "not_expert = PPO(\n", + " policy=MlpPolicy,\n", + " env=env,\n", + " seed=0,\n", + " batch_size=64,\n", + " ent_coef=0.0,\n", + " learning_rate=0.0003,\n", + " n_epochs=10,\n", + " n_steps=64,\n", + ")\n", + "\n", + "not_expert.learn(1_000) # set to 10_000 for slightly better performance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So are they any good? Let's quickly get a point estimate of their performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from stable_baselines3.common.evaluation import evaluate_policy\n", + "\n", + "env.seed(0)\n", + "\n", + "expert_reward, _ = evaluate_policy(expert, env, 1)\n", + "not_expert_reward, _ = evaluate_policy(not_expert, env, 1)\n", + "\n", + "print(f\"Expert reward: {expert_reward:.2f}\")\n", + "print(f\"Not expert reward: {not_expert_reward:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But wait! We only ran the evaluation once. What if we got lucky? Let's run the evaluation a few more times and see what happens." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expert_reward, _ = evaluate_policy(expert, env, 10)\n",
+    "not_expert_reward, _ = evaluate_policy(not_expert, env, 10)\n",
+    "\n",
+    "print(f\"Expert reward: {expert_reward:.2f}\")\n",
+    "print(f\"Not expert reward: {not_expert_reward:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Seems a bit more robust now, but how certain are we? Fortunately, `imitation` provides us with tools to answer this.\n",
+    "\n",
+    "We will perform a permutation test using the `is_significant_reward_improvement` function. We want to be very certain -- let's set the bar high and require a p-value of 0.001."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imitation.testing.reward_improvement import is_significant_reward_improvement\n",
+    "\n",
+    "expert_rewards, _ = evaluate_policy(expert, env, 10, return_episode_rewards=True)\n",
+    "not_expert_rewards, _ = evaluate_policy(\n",
+    "    not_expert, env, 10, return_episode_rewards=True\n",
+    ")\n",
+    "\n",
+    "significant = is_significant_reward_improvement(\n",
+    "    not_expert_rewards, expert_rewards, 0.001\n",
+    ")\n",
+    "\n",
+    "print(\n",
+    "    f\"The expert is {'NOT ' if not significant else ''}significantly better than the not-expert.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Huh, turns out we set the bar too high. We could lower our standards, but that's for cowards.\n",
+    "Instead, we can collect more data and try again."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imitation.testing.reward_improvement import is_significant_reward_improvement\n",
+    "\n",
+    "expert_rewards, _ = evaluate_policy(expert, env, 100, return_episode_rewards=True)\n",
+    "not_expert_rewards, _ = evaluate_policy(\n",
+    "    not_expert, env, 100, return_episode_rewards=True\n",
+    ")\n",
+    "\n",
+    "significant = is_significant_reward_improvement(\n",
+    "    not_expert_rewards, expert_rewards, 0.001\n",
+    ")\n",
+    "\n",
+    "print(\n",
+    "    f\"The expert is {'NOT ' if not significant else ''}significantly better than the not-expert.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we go! We can now be 99.9% confident that the expert is better than the not-expert -- in this specific case, with these specific trained models. It might still be an extraordinary stroke of luck, or a conspiracy to make us choose the wrong algorithm, but outside of that, we can be pretty sure our data's correct."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can use the same principle with imitation learning algorithms. Let's train a behavior cloning algorithm and see how it compares to the expert. This time, we can lower the bar to the standard \"scientific\" threshold of 0.05."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Like in the first tutorial, we will start by collecting some expert data. But to spice it up, let's also get some data from the not-quite-expert."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imitation.data import rollout\n",
+    "from imitation.data.wrappers import RolloutInfoWrapper\n",
+    "from stable_baselines3.common.vec_env import DummyVecEnv\n",
+    "import numpy as np\n",
+    "\n",
+    "rng = np.random.default_rng()\n",
+    "expert_rollouts = rollout.rollout(\n",
+    "    expert,\n",
+    "    DummyVecEnv([lambda: RolloutInfoWrapper(env)]),\n",
+    "    rollout.make_sample_until(min_timesteps=None, min_episodes=50),\n",
+    "    rng=rng,\n",
+    ")\n",
+    "expert_transitions = rollout.flatten_trajectories(expert_rollouts)\n",
+    "\n",
+    "\n",
+    "not_expert_rollouts = rollout.rollout(\n",
+    "    not_expert,\n",
+    "    DummyVecEnv([lambda: RolloutInfoWrapper(env)]),\n",
+    "    rollout.make_sample_until(min_timesteps=None, min_episodes=50),\n",
+    "    rng=rng,\n",
+    ")\n",
+    "not_expert_transitions = rollout.flatten_trajectories(not_expert_rollouts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's try cloning an expert and a non-expert, and see how they compare."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imitation.algorithms import bc\n",
+    "\n",
+    "expert_bc_trainer = bc.BC(\n",
+    "    observation_space=env.observation_space,\n",
+    "    action_space=env.action_space,\n",
+    "    demonstrations=expert_transitions,\n",
+    "    rng=rng,\n",
+    ")\n",
+    "\n",
+    "not_expert_bc_trainer = bc.BC(\n",
+    "    observation_space=env.observation_space,\n",
+    "    action_space=env.action_space,\n",
+    "    demonstrations=not_expert_transitions,\n",
+    "    rng=rng,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expert_bc_trainer.train(n_epochs=2)\n",
+    "not_expert_bc_trainer.train(n_epochs=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bc_expert_rewards, _ = evaluate_policy(\n",
+    "    expert_bc_trainer.policy, env, 10, return_episode_rewards=True\n",
+    ")\n",
+    "bc_not_expert_rewards, _ = evaluate_policy(\n",
+    "    not_expert_bc_trainer.policy, env, 10, return_episode_rewards=True\n",
+    ")\n",
+    "significant = is_significant_reward_improvement(\n",
+    "    bc_not_expert_rewards, bc_expert_rewards, 0.05\n",
+    ")\n",
+    "print(f\"Cloned expert rewards: {bc_expert_rewards}\")\n",
+    "print(f\"Cloned not-expert rewards: {bc_not_expert_rewards}\")\n",
+    "\n",
+    "print(\n",
+    "    f\"Cloned expert is {'NOT ' if not significant else ''}significantly better than the cloned not-expert.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How about comparing the expert clone to the expert itself?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bc_clone_rewards, _ = evaluate_policy(\n",
+    "    expert_bc_trainer.policy, env, 10, return_episode_rewards=True\n",
+    ")\n",
+    "\n",
+    "expert_rewards, _ = evaluate_policy(expert, env, 10, return_episode_rewards=True)\n",
+    "\n",
+    "significant = is_significant_reward_improvement(bc_clone_rewards, expert_rewards, 0.05)\n",
+    "\n",
+    "print(f\"Cloned expert rewards: {bc_clone_rewards}\")\n",
+    "print(f\"Expert rewards: {expert_rewards}\")\n",
+    "\n",
+    "print(\n",
+    "    f\"Expert is {'NOT ' if not significant else ''}significantly better than the cloned expert.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Turns out the expert is not significantly better than the clone -- again, in this case. Note, however, that this is not proof that the clone is as good as the expert -- there's a subtle difference between the two claims in the context of hypothesis testing.\n",
+    "\n",
+    "Note: if you changed the duration of the training at the beginning of this tutorial, you might get different results. While this might break the narrative in this tutorial, it's a good learning opportunity.\n",
+    "\n",
+    "When comparing the performance of two agents, algorithms, or hyperparameter sets, always remember the scope of what you're testing. In this tutorial, we have one instance of an expert -- but RL training is famously unstable, so another training run with another random seed would likely produce a slightly different result. So ideally, we would like to repeat this procedure several times, training the same agent with different random seeds, and then compare the average performance of the two agents.\n",
+    "\n",
+    "Even then, this is just one environment, with one algorithm. So be wary of generalizing your results too much."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also use the same method to compare different algorithms. While CartPole is pretty easy, we can make it more difficult by decreasing the number of episodes in our dataset, and generating them with a suboptimal policy:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rollouts = rollout.rollout(\n",
+    "    expert,\n",
+    "    DummyVecEnv([lambda: RolloutInfoWrapper(env)]),\n",
+    "    rollout.make_sample_until(min_timesteps=None, min_episodes=1),\n",
+    "    rng=rng,\n",
+    ")\n",
+    "transitions = rollout.flatten_trajectories(rollouts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's try training a behavior cloning algorithm on this dataset.\n",
+    "\n",
+    "Note that for DAgger, we have to cheat a little bit -- it's allowed to use the expert policy to generate additional data.\n",
+    "For the purposes of this tutorial, we'll stick with this to avoid spending hours training an expert for a more complex environment.\n",
+    "\n",
+    "So while this little experiment isn't definitive proof that DAgger is better than BC, you can use the same method to compare any two algorithms."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from imitation.algorithms.dagger import SimpleDAggerTrainer\n",
+    "import tempfile\n",
+    "\n",
+    "bc_trainer = bc.BC(\n",
+    "    observation_space=env.observation_space,\n",
+    "    action_space=env.action_space,\n",
+    "    demonstrations=transitions,\n",
+    "    rng=rng,\n",
+    ")\n",
+    "\n",
+    "bc_trainer.train(n_epochs=1)\n",
+    "\n",
+    "\n",
+    "with tempfile.TemporaryDirectory(prefix=\"dagger_example_\") as tmpdir:\n",
+    "    print(tmpdir)\n",
+    "    dagger_bc_trainer = bc.BC(\n",
+    "        observation_space=env.observation_space,\n",
+    "        action_space=env.action_space,\n",
+    "        rng=np.random.default_rng(),\n",
+    "    )\n",
+    "    dagger_trainer = SimpleDAggerTrainer(\n",
+    "        venv=DummyVecEnv([lambda: RolloutInfoWrapper(env)]),\n",
+    "        scratch_dir=tmpdir,\n",
+    "        expert_policy=expert,\n",
+    "        bc_trainer=dagger_bc_trainer,\n",
+    "        rng=np.random.default_rng(),\n",
+    "    )\n",
+    "\n",
+    "    dagger_trainer.train(5000)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After training both BC and DAgger, let's compare their performances again! We expect DAgger to be better -- after all, it's a more advanced algorithm. But is it significantly better?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bc_rewards, _ = evaluate_policy(bc_trainer.policy, env, 10, return_episode_rewards=True)\n",
+    "dagger_rewards, _ = evaluate_policy(\n",
+    "    dagger_trainer.policy, env, 10, return_episode_rewards=True\n",
+    ")\n",
+    "\n",
+    "significant = is_significant_reward_improvement(bc_rewards, dagger_rewards, 0.05)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"BC rewards: {bc_rewards}\")\n",
+    "print(f\"DAgger rewards: {dagger_rewards}\")\n",
+    "\n",
+    "print(\n",
+    "    f\"Our DAgger agent is {'NOT ' if not significant else ''}significantly better than BC.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you increased the number of training iterations for the expert (in the first cell of the tutorial), you should see that DAgger indeed performs better than BC. If you didn't, you will likely see the opposite result. Yet another reason to be careful when interpreting results!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, let's take a moment to remember the limitations of this experiment. We're comparing two algorithms on one environment, with one dataset. We're also using a suboptimal expert policy, which might not be the best choice for BC. If you want to convince yourself that DAgger is better than BC, you should pick a more complex environment, run this experiment several times with different random seeds, and perform some hyperparameter optimization to make sure we're not just using unlucky hyperparameters. At the end, we would also need to run the same hypothesis test across the average returns of several independent runs.\n",
+    "\n",
+    "But now you have all the pieces of the puzzle to do that!"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "bd378ce8f53beae712f05342da42c6a7612fc68b19bea03b52c7b1cdc8851b5f"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.10 64-bit ('venv': venv)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
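
For quick reference outside the notebooks, below is a minimal standalone sketch of the comparison recipe that patch 4/4 introduces: collect per-episode returns for two policies and run a permutation test on them. It is not part of the patches above; it assumes the same CartPole-v1 setup and the SB3 1.x / gym 0.21 APIs pinned in patch 1/4, and the agent names and training budgets are illustrative only.

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy

from imitation.testing.reward_improvement import is_significant_reward_improvement

env = gym.make("CartPole-v1")

# Two candidate policies to compare; hyperparameters mirror the tutorial cells above.
baseline = PPO(policy=MlpPolicy, env=env, seed=0, batch_size=64, n_steps=64)
candidate = PPO(policy=MlpPolicy, env=env, seed=1, batch_size=64, n_steps=64)
baseline.learn(1_000)    # deliberately undertrained, like the tutorial's "not-expert"
candidate.learn(10_000)  # set higher for a proficient agent

# Per-episode returns (not just the mean) give the permutation test samples to work with.
baseline_rewards, _ = evaluate_policy(baseline, env, 100, return_episode_rewards=True)
candidate_rewards, _ = evaluate_policy(candidate, env, 100, return_episode_rewards=True)

# Permutation test at p < 0.05: is the candidate significantly better than the baseline?
significant = is_significant_reward_improvement(baseline_rewards, candidate_rewards, 0.05)
print("significant improvement" if significant else "no significant improvement")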