From 87bca516e333fb50be953372141e7218a73c724a Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Sat, 27 Aug 2022 16:13:31 +0100 Subject: [PATCH 01/14] Replace imitation with seals --- examples/6_train_mce.ipynb | 111 +++++-- setup.py | 3 +- src/imitation/algorithms/mce_irl.py | 28 +- src/imitation/envs/__init__.py | 1 - src/imitation/envs/examples/__init__.py | 8 - src/imitation/envs/examples/model_envs.py | 371 ---------------------- src/imitation/envs/resettable_env.py | 310 ------------------ src/imitation/testing/envs.py | 8 +- tests/algorithms/test_mce_irl.py | 191 +++++------ tests/test_envs.py | 16 - 10 files changed, 204 insertions(+), 843 deletions(-) delete mode 100644 src/imitation/envs/__init__.py delete mode 100644 src/imitation/envs/examples/__init__.py delete mode 100644 src/imitation/envs/examples/model_envs.py delete mode 100644 src/imitation/envs/resettable_env.py diff --git a/examples/6_train_mce.ipynb b/examples/6_train_mce.ipynb index e3aeb87bc..d39ef6425 100644 --- a/examples/6_train_mce.ipynb +++ b/examples/6_train_mce.ipynb @@ -2,16 +2,23 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n", + "is_executing": true + } + }, "source": [ "# Learn a Reward Function using Maximum Conditional Entropy Inverse Reinforcement Learning\n", "\n", "Here, we're going to take a tabular environment with a pre-defined reward function, Cliffworld, and solve for the optimal policy. We then generate demonstrations from this policy, and use them to learn an approximation to the true reward function with MCE IRL. Finally, we directly compare the learned reward to the ground-truth reward (which we have access to in this example)." - ] + ], + "outputs": [] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + }, "source": [ "Cliffworld is a POMDP, and its \"observations\" consist of the (partial) observations proper and the (full) hidden environment state. 
We use `DictExtractWrapper` to extract only the hidden states from the environment, turning it into a fully observable MDP to make computing the optimal policy easy." ] @@ -19,20 +26,24 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "from functools import partial\n", + "from seals import base_envs as envs\n", + "from seals.diagnostics.imitation_examples import CliffWorld\n", + "\n", "from imitation.algorithms.mce_irl import (\n", " MCEIRL,\n", " mce_occupancy_measures,\n", " mce_partition_fh,\n", " TabularPolicy,\n", ")\n", - "\n", "from imitation.data import rollout\n", - "from imitation.envs import resettable_env\n", - "from imitation.envs.examples.model_envs import CliffWorld\n", "from stable_baselines3.common.vec_env import DummyVecEnv\n", "from imitation.rewards import reward_nets\n", "\n", @@ -40,13 +51,19 @@ "env_creator = partial(CliffWorld, height=4, horizon=8, width=7, use_xy_obs=True)\n", "env_single = env_creator()\n", "\n", + "state_env_creator = partial(envs.ExposePOMDPStateWrapper, env_single)\n", + "\n", "# This is just a vectorized environment because `generate_trajectories` expects one\n", - "state_venv = resettable_env.DictExtractWrapper(DummyVecEnv([env_creator] * 4), \"state\")" + "state_venv = DummyVecEnv([state_env_creator] * 4)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Then we derive an expert policy using Bellman backups. We analytically compute the occupancy measures, and also sample some expert trajectories." 
] @@ -54,7 +71,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "_, _, pi = mce_partition_fh(env_single)\n", @@ -62,7 +83,7 @@ "_, om = mce_occupancy_measures(env_single, pi=pi)\n", "\n", "expert = TabularPolicy(\n", - " state_space=env_single.pomdp_state_space,\n", + " state_space=env_single.state_space,\n", " action_space=env_single.action_space,\n", " pi=pi,\n", " rng=None,\n", @@ -79,7 +100,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "### Training the reward function\n", "\n", @@ -89,7 +114,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", @@ -98,7 +127,7 @@ "\n", "def train_mce_irl(demos, hidden_sizes, lr=0.01, **kwargs):\n", " reward_net = reward_nets.BasicRewardNet(\n", - " env_single.pomdp_observation_space,\n", + " env_single.observation_space,\n", " env_single.action_space,\n", " hid_sizes=hidden_sizes,\n", " use_action=False,\n", @@ -154,7 +183,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "As you can see, a linear reward model cannot fit the data. Even though we're training the model on analytically computed occupancy measures for the optimal policy, the resulting reward and occupancy frequencies diverge sharply." 
] @@ -162,7 +195,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "train_mce_irl(om, hidden_sizes=[])" @@ -170,7 +207,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Now, let's try using a very simple nonlinear reward model: an MLP with a single hidden layer. We first train it on the analytically computed occupancy measures. This should give a very precise result." ] @@ -178,7 +219,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "train_mce_irl(om, hidden_sizes=[256])" @@ -186,7 +231,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Then we train it on trajectories sampled from the expert. This gives a stochastic approximation to occupancy measure, so performance is a little worse. Using more expert trajectories should improve performance -- try it!" ] @@ -194,7 +243,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "mce_irl_from_trajs = train_mce_irl(expert_trajs[0:10], hidden_sizes=[256])" @@ -202,7 +255,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "While the learned reward function is quite different from the true reward function, it induces a virtually identical occupancy measure over the states. In particular, states below the top row get almost the same reward as top-row states. 
This is because in Cliff World, there is an upward-blowing wind which will push the agent toward the top row with probability 0.3 at every timestep.\n", "\n", @@ -211,11 +268,8 @@ } ], "metadata": { - "interpreter": { - "hash": "439158cd89905785fcc749928062ade7bfccc3f087fab145e5671f895c635937" - }, "kernelspec": { - "display_name": "Python 3.9.13 ('base')", + "display_name": "Python 3.10.5 ('venv': venv)", "language": "python", "name": "python3" }, @@ -229,7 +283,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.5" + }, + "vscode": { + "interpreter": { + "hash": "32bdf725e05214cef1880b58867ef93d96040658820b28e8afccdd8b1b705058" + } } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 81a095452..c067d2a30 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,6 @@ # working versions to make our CI/CD pipeline as stable as possible. TESTS_REQUIRE = ( [ - "seals==0.1.2", "black[jupyter]~=22.6.0", "coverage~=6.4.2", "codecov~=2.1.12", @@ -119,6 +118,8 @@ def run(self): "torch>=1.4.0", "tqdm", "scikit-learn>=0.21.2", + "seals@git+" + "https://github.com/HumanCompatibleAI/seals.git@imitation-envs-to-seals", STABLE_BASELINES3, # TODO(adam) switch to upstream release if they make it # See https://github.com/IDSIA/sacred/issues/879 diff --git a/src/imitation/algorithms/mce_irl.py b/src/imitation/algorithms/mce_irl.py index 2c7e29378..8e9412be7 100644 --- a/src/imitation/algorithms/mce_irl.py +++ b/src/imitation/algorithms/mce_irl.py @@ -14,17 +14,17 @@ import scipy.special import torch as th from stable_baselines3.common import policies +from seals import base_envs as envs from imitation.algorithms import base from imitation.data import rollout, types -from imitation.envs import resettable_env from imitation.rewards import reward_nets from imitation.util import logger as imit_logger from imitation.util import networks, util def mce_partition_fh( - env: resettable_env.TabularModelEnv, + env: 
envs.TabularModelPOMDP, *, reward: Optional[np.ndarray] = None, discount: float = 1.0, @@ -46,8 +46,8 @@ def mce_partition_fh( """ # shorthand horizon = env.horizon - n_states = env.n_states - n_actions = env.n_actions + n_states = env.state_dim + n_actions = env.action_dim T = env.transition_matrix if reward is None: reward = env.reward_matrix @@ -77,7 +77,7 @@ def mce_partition_fh( def mce_occupancy_measures( - env: resettable_env.TabularModelEnv, + env: envs.TabularModelPOMDP, *, reward: Optional[np.ndarray] = None, pi: Optional[np.ndarray] = None, @@ -102,8 +102,8 @@ def mce_occupancy_measures( """ # shorthand horizon = env.horizon - n_states = env.n_states - n_actions = env.n_actions + n_states = env.state_dim + n_actions = env.action_dim T = env.transition_matrix if reward is None: reward = env.reward_matrix @@ -252,7 +252,7 @@ class MCEIRL(base.DemonstrationAlgorithm[types.TransitionsMinimal]): def __init__( self, demonstrations: Optional[MCEDemonstrations], - env: resettable_env.TabularModelEnv, + env: envs.TabularModelPOMDP, reward_net: reward_nets.RewardNet, optimizer_cls: Type[th.optim.Optimizer] = th.optim.Adam, optimizer_kwargs: Optional[Mapping[str, Any]] = None, @@ -313,17 +313,17 @@ def __init__( # Initialize policy to be uniform random. We don't use this for MCE IRL # training, but it gives us something to return at all times with `policy` # property, similar to other algorithms. 
- ones = np.ones((self.env.horizon, self.env.n_states, self.env.n_actions)) - uniform_pi = ones / self.env.n_actions + ones = np.ones((self.env.horizon, self.env.state_dim, self.env.action_dim)) + uniform_pi = ones / self.env.action_dim self._policy = TabularPolicy( - state_space=self.env.pomdp_state_space, + state_space=self.env.state_space, action_space=self.env.action_space, pi=uniform_pi, rng=self.rng, ) def _set_demo_from_trajectories(self, trajs: Iterable[types.Trajectory]) -> None: - self.demo_state_om = np.zeros((self.env.n_states,)) + self.demo_state_om = np.zeros((self.env.state_dim,)) num_demos = 0 for traj in trajs: cum_discount = 1.0 @@ -339,7 +339,7 @@ def _set_demo_from_obs( dones: Optional[np.ndarray], next_obses: Optional[np.ndarray], ) -> None: - self.demo_state_om = np.zeros((self.env.n_states,)) + self.demo_state_om = np.zeros((self.env.state_dim,)) for obs in obses: if isinstance(obs, th.Tensor): @@ -360,7 +360,7 @@ def _set_demo_from_obs( else: warnings.warn( "Training MCEIRL with transitions that lack next observation." - "This will result in systematically wrong occupancy measure estimates.", + "This will result in systematically wrong occupancy measure estimates.", ) # Normalize occupancy measure estimates diff --git a/src/imitation/envs/__init__.py b/src/imitation/envs/__init__.py deleted file mode 100644 index d3a8bbd11..000000000 --- a/src/imitation/envs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Environment base classes and demo environments.""" diff --git a/src/imitation/envs/examples/__init__.py b/src/imitation/envs/examples/__init__.py deleted file mode 100644 index 162d0d26a..000000000 --- a/src/imitation/envs/examples/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Environments used for testing and benchmarking. - -These are not a core part of the imitation package. They are relatively lightly tested, -and may be changed without warning. 
-""" - -# Register environments with Gym -from imitation.envs.examples import model_envs # noqa: F401 diff --git a/src/imitation/envs/examples/model_envs.py b/src/imitation/envs/examples/model_envs.py deleted file mode 100644 index e5687b192..000000000 --- a/src/imitation/envs/examples/model_envs.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Example discrete MDPs for use with tabular MCE IRL.""" - -from typing import Optional - -import gym -import numpy as np - -from imitation.envs.resettable_env import TabularModelEnv - - -def make_random_trans_mat( - n_states, - n_actions, - max_branch_factor, - rand_state=np.random, -) -> np.ndarray: - """Make a 'random' transition matrix. - - Each action goes to at least `max_branch_factor` other states from the - current state, with transition distribution sampled from Dirichlet(1,1,…,1). - - This roughly apes the strategy from some old Lisp code that Rich Sutton - left on the internet (http://incompleteideas.net/RandomMDPs.html), and is - therefore a legitimate way to generate MDPs. - - Args: - n_states: Number of states. - n_actions: Number of actions. - max_branch_factor: Maximum number of states that can be reached from - each state-action pair. - rand_state: NumPy random state. - - Returns: - The transition matrix `mat`, where `mat[s,a,next_s]` gives the probability - of transitioning to `next_s` after taking action `a` in state `s`. 
- """ - out_mat = np.zeros((n_states, n_actions, n_states), dtype="float32") - for start_state in range(n_states): - for action in range(n_actions): - # uniformly sample a number of successors in [1,max_branch_factor] - # for this action - succs = rand_state.randint(1, max_branch_factor + 1) - next_states = rand_state.choice(n_states, size=(succs,), replace=False) - # generate random vec in probability simplex - next_vec = rand_state.dirichlet(np.ones((succs,))) - next_vec = next_vec / np.sum(next_vec) - out_mat[start_state, action, next_states] = next_vec - return out_mat - - -def make_random_state_dist( - n_avail: int, - n_states: int, - rand_state: np.random.RandomState = np.random, -) -> np.ndarray: - """Make a random initial state distribution over n_states. - - Args: - n_avail: Number of states available to transition into. - n_states: Total number of states. - rand_state: NumPy random state. - - Returns: - An initial state distribution that is zero at all but a uniformly random - chosen subset of `n_avail` states. This subset of chosen states are set to a - sample from the uniform distribution over the (n_avail-1) simplex, aka the - flat Dirichlet distribution. - - Raises: - ValueError: If `n_avail` is not in the range `(0, n_states]`. - """ # noqa: DAR402 - assert 0 < n_avail <= n_states - init_dist = np.zeros((n_states,)) - next_states = rand_state.choice(n_states, size=(n_avail,), replace=False) - avail_state_dist = rand_state.dirichlet(np.ones((n_avail,))) - init_dist[next_states] = avail_state_dist - assert np.sum(init_dist > 0) == n_avail - init_dist = init_dist / np.sum(init_dist) - return init_dist - - -def make_obs_mat( - n_states: int, - is_random: bool, - obs_dim: Optional[int], - rand_state: np.random.RandomState = np.random, -) -> np.ndarray: - """Makes an observation matrix with a single observation for each state. - - Args: - n_states (int): Number of states. - is_random (bool): Are observations drawn at random? 
- If `True`, draw from random normal distribution. - If `False`, are unique one-hot vectors for each state. - obs_dim (int or NoneType): Must be `None` if `is_random == False`. - Otherwise, this must be set to the size of the random vectors. - rand_state (np.random.RandomState): Random number generator. - - Returns: - A matrix of shape `(n_states, obs_dim if is_random else n_states)`. - """ - if not is_random: - assert obs_dim is None - if is_random: - obs_mat = rand_state.normal(0, 2, (n_states, obs_dim)) - else: - obs_mat = np.identity(n_states) - assert ( - obs_mat.ndim == 2 and obs_mat.shape[:1] == (n_states,) and obs_mat.shape[1] > 0 - ) - return obs_mat.astype(np.float32) - - -class RandomMDP(TabularModelEnv): - """AN MDP with a random transition matrix. - - Random matrix is created by `make_random_trans_mat`. - """ - - def __init__( - self, - *, - n_states: int, - n_actions: int, - branch_factor: int, - horizon: int, - random_obs: bool, - obs_dim: Optional[int] = None, - generator_seed: Optional[int] = None, - ): - """Builds RandomMDP. - - Args: - n_states: Number of states. - n_actions: Number of actions. - branch_factor: Maximum number of states that can be reached from - each state-action pair. - horizon: The horizon of the MDP, i.e. the episode length. - random_obs: Whether to use random observations (True) - or one-hot coded (False). - obs_dim: The size of the observation vectors; must be `None` - if `random_obs == False`. - generator_seed: Seed for NumPy RNG. 
- """ - super().__init__() - # this generator is ONLY for constructing the MDP, not for controlling - # random outcomes during rollouts - rand_gen = np.random.RandomState(generator_seed) - if random_obs: - if obs_dim is None: - obs_dim = n_states - else: - assert obs_dim is None - self._observation_matrix = make_obs_mat( - n_states=n_states, - is_random=random_obs, - obs_dim=obs_dim, - rand_state=rand_gen, - ) - self._transition_matrix = make_random_trans_mat( - n_states=n_states, - n_actions=n_actions, - max_branch_factor=branch_factor, - rand_state=rand_gen, - ) - self._initial_state_dist = make_random_state_dist( - n_avail=branch_factor, - n_states=n_states, - rand_state=rand_gen, - ) - self._horizon = horizon - self._reward_weights = rand_gen.randn(self._observation_matrix.shape[-1]) - self._reward_matrix = self._observation_matrix @ self._reward_weights - assert self._reward_matrix.shape == (self.n_states,) - - @property - def observation_matrix(self): - return self._observation_matrix - - @property - def transition_matrix(self): - return self._transition_matrix - - @property - def reward_matrix(self): - return self._reward_matrix - - @property - def initial_state_dist(self): - return self._initial_state_dist - - @property - def horizon(self): - return self._horizon - - -class CliffWorld(TabularModelEnv): - """A grid world with a goal next to a cliff the agent may fall into. - - Illustration:: - - 0 1 2 3 4 5 6 7 8 9 - +-+-+-+-+-+-+-+-+-+-+ Wind: - 0 |S|C|C|C|C|C|C|C|C|G| - +-+-+-+-+-+-+-+-+-+-+ ^ ^ ^ - 1 | | | | | | | | | | | | | | - +-+-+-+-+-+-+-+-+-+-+ - 2 | | | | | | | | | | | ^ ^ ^ - +-+-+-+-+-+-+-+-+-+-+ | | | - - Aim is to get from S to G. The G square has reward +10, the C squares - ("cliff") have reward -10, and all other squares have reward -1. Agent can - move in all directions (except through walls), but there is 30% chance that - they will be blown upwards by one more unit than intended due to wind. 
- Optimal policy is to go out a bit and avoid the cliff, but still hit goal - eventually. - """ - - def __init__( - self, - *, - width: int, - height: int, - horizon: int, - use_xy_obs: bool, - rew_default: int = -1, - rew_goal: int = 10, - rew_cliff: int = -10, - fail_p: float = 0.3, - ): - """Builds CliffWorld with specified dimensions and reward.""" - super().__init__() - assert ( - width >= 3 and height >= 2 - ), "degenerate grid world requested; is this a bug?" - self.width = width - self.height = height - succ_p = 1 - fail_p - n_states = width * height - O_mat = self._observation_matrix = np.zeros( - (n_states, 2 if use_xy_obs else n_states), - dtype=np.float32, - ) - R_vec = self._reward_matrix = np.zeros((n_states,)) - T_mat = self._transition_matrix = np.zeros((n_states, 4, n_states)) - self._horizon = horizon - - def to_id_clamp(row, col): - """Convert (x,y) state to state ID, after clamp x & y to lie in grid.""" - row = min(max(row, 0), height - 1) - col = min(max(col, 0), width - 1) - state_id = row * width + col - assert 0 <= state_id < self.n_states - return state_id - - for row in range(height): - for col in range(width): - state_id = to_id_clamp(row, col) - - # start by computing reward - if row > 0: - r = rew_default # blank - elif col == 0: - r = rew_default # start - elif col == width - 1: - r = rew_goal # goal - else: - r = rew_cliff # cliff - R_vec[state_id] = r - - # now compute observation - if use_xy_obs: - # (x, y) coordinate scaled to (0,1) - O_mat[state_id, :] = [ - float(col) / (width - 1), - float(row) / (height - 1), - ] - else: - # our observation matrix is just the identity; observation - # is an indicator vector telling us exactly what state - # we're in - O_mat[state_id, state_id] = 1 - - # finally, compute transition matrix entries for each of the - # four actions - for drow in [-1, 1]: - for dcol in [-1, 1]: - action_id = (drow + 1) + (dcol + 1) // 2 - target_state = to_id_clamp(row + drow, col + dcol) - fail_state = 
to_id_clamp(row + drow - 1, col + dcol) - T_mat[state_id, action_id, fail_state] += fail_p - T_mat[state_id, action_id, target_state] += succ_p - - assert np.allclose(np.sum(T_mat, axis=-1), 1, rtol=1e-5), ( - "un-normalised matrix %s" % O_mat - ) - - @property - def observation_matrix(self): - return self._observation_matrix - - @property - def transition_matrix(self): - return self._transition_matrix - - @property - def reward_matrix(self): - return self._reward_matrix - - @property - def horizon(self): - return self._horizon - - @property - def initial_state_dist(self): - # always start in s0 - rv = np.zeros((self.n_states,)) - rv[0] = 1.0 - return rv - - def draw_value_vec(self, D) -> None: - """Use matplotlib to plot a vector of values for each state. - - The vector could represent things like reward, occupancy measure, etc. - - Args: - D: the vector to plot. - """ - import matplotlib.pyplot as plt - - grid = D.reshape(self.height, self.width) - plt.imshow(grid) - plt.gca().grid(False) - - -def register_cliff(suffix, kwargs): - gym.register( - f"imitation/CliffWorld{suffix}-v0", - entry_point="imitation.envs.examples.model_envs:CliffWorld", - kwargs=kwargs, - ) - - -for width, height, horizon in [(7, 4, 9), (15, 6, 18), (100, 20, 110)]: - for use_xy in [False, True]: - use_xy_str = "XY" if use_xy else "" - register_cliff( - f"{width}x{height}{use_xy_str}", - kwargs={ - "width": width, - "height": height, - "use_xy_obs": use_xy, - "horizon": horizon, - }, - ) - -# These parameter choices are somewhat arbitrary. -# We anticipate most users will want to construct RandomMDP directly. 
-gym.register( - "imitation/Random-v0", - entry_point="imitation.envs.examples.model_envs:RandomMDP", - kwargs={ - "n_states": 16, - "n_actions": 3, - "branch_factor": 2, - "horizon": 20, - "random_obs": True, - "obs_dim": 5, - "generator_seed": 42, - }, -) diff --git a/src/imitation/envs/resettable_env.py b/src/imitation/envs/resettable_env.py deleted file mode 100644 index 0e8780d13..000000000 --- a/src/imitation/envs/resettable_env.py +++ /dev/null @@ -1,310 +0,0 @@ -"""Finite-horizon discrete environments with known transition dynamics. - -These are handy when you want to perform exact maxent policy optimisation. -""" - -import abc -from typing import Optional - -import gym -import numpy as np -from gym import spaces -from stable_baselines3.common import vec_env - - -class ResettableEnv(gym.Env, abc.ABC): - """ABC for environments that are resettable. - - Specifically, these environments provide oracle access to sample from the initial - state distribution and transition dynamics, and compute the reward and termination - condition. Almost all simulated environments can meet these criteria. 
- """ - - def __init__(self): - """Builds a ResettableEnv with all attributes initialized to None.""" - self._pomdp_state_space = None - self._pomdp_observation_space = None - self._action_space = None - self.cur_state = None - self._n_actions_taken = None - self.rand_state: Optional[np.random.RandomState] = None - self.seed() - - @abc.abstractmethod - def initial_state(self): - """Samples from the initial state distribution.""" - - @abc.abstractmethod - def transition(self, state, action): - """Samples from transition distribution.""" - - @abc.abstractmethod - def reward(self, state, action, new_state): - """Computes reward for a given transition.""" - - @abc.abstractmethod - def terminal(self, state, step: int) -> bool: - """Is the state terminal?""" - - @abc.abstractmethod - def obs_from_state(self, state): - """Returns observation produced by a given state.""" - - @property - def pomdp_state_space(self) -> gym.Space: - """The POMDP's state space. - - In fully observable MDPs, `pomdp_state_space == pomdp_observation_space`. - - Returns: - The POMDP state space of this environment. - """ - return self._pomdp_state_space - - @property - def pomdp_observation_space(self) -> gym.Space: - """The POMDP's observation space. - - In fully observable MDPs, `pomdp_state_space == pomdp_observation_space`. - - The actual "observation" returned by step() includes both this *and* the state. - - Returns: - The POMDP observation space of this environment. - """ - return self._pomdp_observation_space - - @property - def observation_space(self) -> gym.Space: - """Combined observation space. - - Dict space, including both the POMDP's state and observation space. - The intention is to support both algorithms that train on the POMDP's - observations, as well as allowing some algorithms to "cheat" and - operate directly on the underlying POMDP states. - - Return type of reset() and component of step(). - - Returns: - The observation space of this environment. 
- """ - return gym.spaces.Dict( - {"obs": self.pomdp_observation_space, "state": self.pomdp_state_space}, - ) - - @property - def action_space(self) -> gym.Space: - """Action space. - - Parameter type of step(). - - Returns: - The action space of this environment. - """ - return self._action_space - - @property - def n_actions_taken(self) -> int: - """Number of steps taken so far.""" - return self._n_actions_taken - - def seed(self, seed=None): - if seed is None: - # Gym API wants list of seeds to be returned for some reason, so - # generate a seed explicitly in this case - seed = np.random.randint(0, 1 << 31) - self.rand_state = np.random.RandomState(seed) - return [seed] - - def reset(self): - self.cur_state = self.initial_state() - self._n_actions_taken = 0 - obs = self.obs_from_state(self.cur_state) - return {"obs": obs, "state": self.cur_state} - - def step(self, action): - if self.cur_state is None or self._n_actions_taken is None: - raise ValueError("Need to call reset() before first step()") - - old_state = self.cur_state - self.cur_state = self.transition(self.cur_state, action) - obs = self.obs_from_state(self.cur_state) - rew = self.reward(old_state, action, self.cur_state) - self._n_actions_taken += 1 - done = self.terminal(self.cur_state, self._n_actions_taken) - - infos = {"old_state": old_state, "new_state": self.cur_state} - combined_obs = {"obs": obs, "state": self.cur_state} - return combined_obs, rew, done, infos - - -class TabularModelEnv(ResettableEnv, abc.ABC): - """ABC for tabular environments with known dynamics.""" - - def __init__(self): - """Initialise common attributes of all model-based environments. - - Attributes include current state & number of actions taken so far (initial - None, so that error can be thrown if reset() is not called), attributes for - cached observation/action space, and random seed for rollouts. 
- """ - super().__init__() - - @property - def pomdp_state_space(self) -> gym.Space: - # Construct spaces lazily, so they can depend on properties in subclasses. - if self._pomdp_state_space is None: - self._pomdp_state_space = spaces.Discrete(self.state_dim) - return self._pomdp_state_space - - @property - def pomdp_observation_space(self) -> gym.Space: - # Construct spaces lazily, so they can depend on properties in subclasses. - if self._pomdp_observation_space is None: - self._pomdp_observation_space = spaces.Box( - low=float("-inf"), - high=float("inf"), - shape=(self.obs_dim,), - dtype=self.obs_dtype, - ) - return self._pomdp_observation_space - - @property - def action_space(self) -> gym.Space: - # Construct spaces lazily, so they can depend on properties in subclasses. - if self._action_space is None: - self._action_space = spaces.Discrete(self.n_actions) - return self._action_space - - def initial_state(self): - return self.rand_state.choice(self.n_states, p=self.initial_state_dist) - - def transition(self, state, action): - out_dist = self.transition_matrix[state, action] - choice_states = np.arange(self.n_states) - return int(self.rand_state.choice(choice_states, p=out_dist, size=())) - - def reward(self, state, action, new_state): - reward = self.reward_matrix[state] - assert np.isscalar(reward), reward - return reward - - def terminal(self, state, n_actions_taken): - return n_actions_taken >= self.horizon - - def obs_from_state(self, state): - # Copy so it can't be mutated in-place (updates will be reflected in - # self.observation_matrix!) 
- obs = self.observation_matrix[state].copy() - assert obs.ndim == 1, obs.shape - return obs - - @property - def n_states(self): - """Number of states in this MDP (int).""" - return self.transition_matrix.shape[0] - - @property - def n_actions(self): - """Number of actions in this MDP (int).""" - return self.transition_matrix.shape[1] - - @property - def state_dim(self): - """Size of state vectors for this MDP.""" - return self.observation_matrix.shape[0] - - @property - def obs_dim(self): - """Size of observation vectors for this MDP.""" - return self.observation_matrix.shape[1] - - @property - def obs_dtype(self): - """Data type of observation vectors (e.g. np.float32).""" - return self.observation_matrix.dtype - - # ############################### # - # METHODS THAT MUST BE OVERRIDDEN # - # ############################### # - - @property - @abc.abstractmethod - def transition_matrix(self): - """3D transition matrix. - - Dimensions correspond to current state, current action, and next state. - - In other words, if `T` is our returned matrix, then `T[s,a,sprime]` is the - chance of transitioning into state `sprime` after taking action `a` in state - `s`. - """ - - @property - @abc.abstractmethod - def observation_matrix(self): - """2D observation matrix. - - Dimensions correspond to current state (first dim) and elements of observation - (second dim). - """ - - @property - @abc.abstractmethod - def reward_matrix(self): - """1D reward matrix with an element corresponding to each state.""" - - @property - @abc.abstractmethod - def horizon(self): - """Number of actions that can be taken in an episode.""" - - @property - @abc.abstractmethod - def initial_state_dist(self): - """1D vector representing a distribution over initial states.""" - return - - -class DictExtractWrapper(vec_env.VecEnvWrapper): - """Extracts key from dict observation of wrapped environment. 
- - For example, can be used with instances of `ResettableEnv` to extract either - `'obs'` or `'state'` keys as appropriate. This is useful when you want a model - to depend on only one, or if the model does not natively support dict-based - observations. - """ - - def __init__(self, venv: vec_env.VecEnv, key: str): - """Builds DictExtractWrapper. - - Args: - venv: A vectorized environment with dict observation space. - key: The key to extract from observations. - - Raises: - TypeError: The observation space of `venv` is not a dict. - KeyError: `key` is not present in the observation space of `venv`. - """ - if not isinstance(venv.observation_space, gym.spaces.Dict): - raise TypeError( - f"Observation space '{venv.observation_space}' is not dict type.", - ) - if key not in venv.observation_space.spaces: - raise KeyError( - f"Unrecognized '{key}'; valid keys = " - f"{venv.observation_space.spaces.keys()}", - ) - super().__init__(venv=venv, observation_space=venv.observation_space[key]) - self.key = key - - def reset(self): - obs = self.venv.reset() - return obs[self.key] - - def step_wait(self): - obs, rew, dones, infos = self.venv.step_wait() - for info in infos: - if "terminal_observation" in info: - info["terminal_observation"] = info["terminal_observation"][self.key] - return obs[self.key], rew, dones, infos diff --git a/src/imitation/testing/envs.py b/src/imitation/testing/envs.py index b43ab45b5..72978dd1a 100644 --- a/src/imitation/testing/envs.py +++ b/src/imitation/testing/envs.py @@ -17,11 +17,11 @@ def test_model_based(env: gym.Env) -> None: AssertionError if test fails. 
""" state = env.initial_state() - assert env.pomdp_state_space.contains(state) + assert env.state_space.contains(state) action = env.action_space.sample() new_state = env.transition(state, action) - assert env.pomdp_state_space.contains(new_state) + assert env.state_space.contains(new_state) reward = env.reward(state, action, new_state) assert isinstance(reward, float) @@ -30,6 +30,6 @@ def test_model_based(env: gym.Env) -> None: assert isinstance(done, bool) obs = env.obs_from_state(state) - assert env.pomdp_observation_space.contains(obs) + assert env.observation_space.contains(obs) next_obs = env.obs_from_state(new_state) - assert env.pomdp_observation_space.contains(next_obs) + assert env.observation_space.contains(next_obs) diff --git a/tests/algorithms/test_mce_irl.py b/tests/algorithms/test_mce_irl.py index 7bb9b7560..d4f2f026b 100644 --- a/tests/algorithms/test_mce_irl.py +++ b/tests/algorithms/test_mce_irl.py @@ -7,6 +7,8 @@ import pytest import torch as th from stable_baselines3.common import vec_env +from seals import base_envs as envs +from seals.diagnostics import imitation_examples as imit_envs from imitation.algorithms import base from imitation.algorithms.mce_irl import ( @@ -16,8 +18,6 @@ mce_partition_fh, ) from imitation.data import rollout -from imitation.envs import resettable_env -from imitation.envs.examples import model_envs from imitation.rewards import reward_nets from imitation.util.util import tensor_iter_norm @@ -52,7 +52,7 @@ def test_random_mdp(): horizon = 5 * (i + 1) random_obs = (i % 2) == 0 obs_dim = (i * 3 + 4) ** 2 + i - mdp = model_envs.RandomMDP( + mdp = imit_envs.RandomMDP( n_states=n_states, n_actions=n_actions, branch_factor=branch_factor, @@ -94,7 +94,7 @@ def test_random_mdp(): @pytest.mark.parametrize("discount", DISCOUNT_RATES) def test_policy_om_random_mdp(discount: float): """Test that optimal policy occupancy measure ("om") for a random MDP is sane.""" - mdp = gym.make("imitation/Random-v0") + mdp = 
gym.make("seals/Random-v0") V, Q, pi = mce_partition_fh(mdp, discount=discount) assert np.all(np.isfinite(V)) assert np.all(np.isfinite(Q)) @@ -116,97 +116,104 @@ def test_policy_om_random_mdp(discount: float): assert np.allclose(np.sum(D), expected_sum) -class ReasonableMDP(resettable_env.TabularModelEnv): +class ReasonablePOMDP(envs.TabularModelPOMDP): """A tabular MDP with sensible parameters.""" - - observation_matrix = np.array( - [ - [3, -5, -1, -1, -4, 5, 3, 0], - # state 1 (top) - [4, -4, 2, 2, -4, -1, -2, -2], - # state 2 (bottom, equiv to top) - [3, -1, 5, -1, 0, 2, -5, 2], - # state 3 (middle, very low reward and so dominated by others) - [-5, -1, 4, 1, 4, 1, 5, 3], - # state 4 (final, all self loops, good reward) - [2, -5, 1, -5, 1, 4, 4, -3], - ], - ) - transition_matrix = np.array( - [ - # transitions out of state 0 + def __init__(self): + observation_matrix = np.array( [ - # action 0: goes to state 1 (sometimes 2) - [0, 0.9, 0.1, 0, 0], - # action 1: goes to state 3 deterministically - [0, 0, 0, 1, 0], - # action 2: goes to state 2 (sometimes 2) - [0, 0.1, 0.9, 0, 0], + [3, -5, -1, -1, -4, 5, 3, 0], + # state 1 (top) + [4, -4, 2, 2, -4, -1, -2, -2], + # state 2 (bottom, equiv to top) + [3, -1, 5, -1, 0, 2, -5, 2], + # state 3 (middle, very low reward and so dominated by others) + [-5, -1, 4, 1, 4, 1, 5, 3], + # state 4 (final, all self loops, good reward) + [2, -5, 1, -5, 1, 4, 4, -3], ], - # transitions out of state 1 - [ - # action 0: goes to state 3 or 4 (sub-optimal) - [0, 0, 0, 0.05, 0.95], - # action 1: goes to state 3 (bad) - [0, 0, 0, 1, 0], - # action 2: goes to state 4 (good!) - [0, 0, 0, 0, 1], - ], - # transitions out of state 2 (basically the same) - [ - # action 0: goes to state 3 or 4 (sub-optimal) - [0, 0, 0, 0.05, 0.95], - # action 1: goes to state 3 (bad) - [0, 0, 0, 1, 0], - # action 2: goes to state 4 (good!) 
- [0, 0, 0, 0, 1], - ], - # transitions out of state 3 (all go to state 4) + ) + transition_matrix = np.array( [ - # action 0 - [0, 0, 0, 0, 1], - # action 1 - [0, 0, 0, 0, 1], - # action 2 - [0, 0, 0, 0, 1], + # transitions out of state 0 + [ + # action 0: goes to state 1 (sometimes 2) + [0, 0.9, 0.1, 0, 0], + # action 1: goes to state 3 deterministically + [0, 0, 0, 1, 0], + # action 2: goes to state 2 (sometimes 2) + [0, 0.1, 0.9, 0, 0], + ], + # transitions out of state 1 + [ + # action 0: goes to state 3 or 4 (sub-optimal) + [0, 0, 0, 0.05, 0.95], + # action 1: goes to state 3 (bad) + [0, 0, 0, 1, 0], + # action 2: goes to state 4 (good!) + [0, 0, 0, 0, 1], + ], + # transitions out of state 2 (basically the same) + [ + # action 0: goes to state 3 or 4 (sub-optimal) + [0, 0, 0, 0.05, 0.95], + # action 1: goes to state 3 (bad) + [0, 0, 0, 1, 0], + # action 2: goes to state 4 (good!) + [0, 0, 0, 0, 1], + ], + # transitions out of state 3 (all go to state 4) + [ + # action 0 + [0, 0, 0, 0, 1], + # action 1 + [0, 0, 0, 0, 1], + # action 2 + [0, 0, 0, 0, 1], + ], + # transitions out of state 4 (all go back to state 0) + [ + # action 0 + [1, 0, 0, 0, 0], + # action 1 + [1, 0, 0, 0, 0], + # action 2 + [1, 0, 0, 0, 0], + ], ], - # transitions out of state 4 (all go back to state 0) + ) + reward_matrix = np.array( [ - # action 0 - [1, 0, 0, 0, 0], - # action 1 - [1, 0, 0, 0, 0], - # action 2 - [1, 0, 0, 0, 0], + # state 0 (okay reward, but we can't go back so it doesn't matter) + 1, + # states 1 & 2 have same (okay) reward + 2, + 2, + # state 3 has very negative reward (so avoid it!) + -20, + # state 4 has pretty good reward (good enough that we should move out + # of 1 & 2) + 3, ], - ], - ) - reward_matrix = np.array( - [ - # state 0 (okay reward, but we can't go back so it doesn't matter) - 1, - # states 1 & 2 have same (okay) reward - 2, - 2, - # state 3 has very negative reward (so avoid it!) 
- -20, - # state 4 has pretty good reward (good enough that we should move out - # of 1 & 2) - 3, - ], - ) - # always start in s0 or s4 - initial_state_dist = [0.5, 0, 0, 0, 0.5] - horizon = 20 + ) + # always start in s0 or s4 + initial_state_dist = np.array([0.5, 0.0, 0.0, 0.0, 0.5]) + horizon = 20 + super().__init__( + observation_matrix=observation_matrix, + transition_matrix=transition_matrix, + reward_matrix=reward_matrix, + initial_state_dist=initial_state_dist, + horizon=horizon, + ) @pytest.mark.parametrize("discount", DISCOUNT_RATES) -def test_policy_om_reasonable_mdp(discount: float): +def test_policy_om_reasonable_pomdp(discount: float): # MDP described above - mdp = ReasonableMDP() + pomdp = ReasonablePOMDP() # get policy etc. for our MDP - V, Q, pi = mce_partition_fh(mdp, discount=discount) - Dt, D = mce_occupancy_measures(mdp, pi=pi, discount=discount) + V, Q, pi = mce_partition_fh(pomdp, discount=discount) + Dt, D = mce_occupancy_measures(pomdp, pi=pi, discount=discount) assert np.all(np.isfinite(V)) assert np.all(np.isfinite(Q)) assert np.all(np.isfinite(pi)) @@ -230,7 +237,7 @@ def test_policy_om_reasonable_mdp(discount: float): assert np.all(pi[:19, 1, 2] > pi[:19, 1, 0]) assert np.all(pi[:19, 1, 0] > pi[:19, 1, 1]) # check that Dt[0] matches our initial state dist - assert np.allclose(Dt[0], mdp.initial_state_dist) + assert np.allclose(Dt[0], pomdp.initial_state_dist) def test_tabular_policy(): @@ -298,7 +305,7 @@ def test_tabular_policy_randomness(): def test_mce_irl_demo_formats(): - mdp = model_envs.RandomMDP( + mdp = imit_envs.RandomMDP( n_states=5, n_actions=3, branch_factor=2, @@ -307,8 +314,8 @@ def test_mce_irl_demo_formats(): obs_dim=None, generator_seed=42, ) - venv = vec_env.DummyVecEnv([lambda: mdp]) - state_venv = resettable_env.DictExtractWrapper(venv, "state") + state_env = envs.ExposePOMDPStateWrapper(mdp) + state_venv = vec_env.DummyVecEnv([lambda: state_env] * 4) trajs = rollout.generate_trajectories( policy=None, 
venv=state_venv, @@ -330,7 +337,7 @@ def test_mce_irl_demo_formats(): th.random.manual_seed(715298) # create reward network so we can be sure it's seeded identically reward_net = reward_nets.BasicRewardNet( - mdp.pomdp_observation_space, + mdp.observation_space, mdp.action_space, use_action=False, use_next_state=False, @@ -362,7 +369,7 @@ def test_mce_irl_reasonable_mdp( th.random.manual_seed(715298) # test MCE IRL on the MDP - mdp = ReasonableMDP() + mdp = ReasonablePOMDP() mdp.seed(715298) # demo occupancy measure @@ -370,7 +377,7 @@ def test_mce_irl_reasonable_mdp( Dt, D = mce_occupancy_measures(mdp, pi=pi, discount=discount) reward_net = reward_nets.BasicRewardNet( - mdp.pomdp_observation_space, + mdp.observation_space, mdp.action_space, use_action=False, use_next_state=False, @@ -384,8 +391,8 @@ def test_mce_irl_reasonable_mdp( # make sure weights have non-insane norm assert tensor_iter_norm(reward_net.parameters()) < 1000 - venv = vec_env.DummyVecEnv([lambda: mdp]) - state_venv = resettable_env.DictExtractWrapper(venv, "state") + state_env = envs.ExposePOMDPStateWrapper(mdp) + state_venv = vec_env.DummyVecEnv([lambda: state_env] * 4) trajs = rollout.generate_trajectories( mce_irl.policy, state_venv, diff --git a/tests/test_envs.py b/tests/test_envs.py index fa53931d1..bec42ba9c 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -6,8 +6,6 @@ from seals.testing import envs as seals_test from stable_baselines3.common import envs, vec_env -# Unused imports is for side-effect of registering environments -from imitation.envs import examples, resettable_env # noqa: F401 from imitation.testing import envs as imitation_test ENV_NAMES = [ @@ -51,17 +49,3 @@ def test_rollout_schema(self, env: gym.Env): def test_render(self, env: gym.Env): """Tests `render()` supports modes specified in environment metadata.""" seals_test.test_render(env, raises_fn=pytest.raises) - - -def test_dict_extract_wrapper(): - """Tests `DictExtractWrapper` input validation and 
extraction.""" - venv = vec_env.DummyVecEnv([lambda: envs.SimpleMultiObsEnv()]) - with pytest.raises(KeyError, match="Unrecognized .*"): - resettable_env.DictExtractWrapper(venv, "foobar") - wrapped_venv = resettable_env.DictExtractWrapper(venv, "vec") - with pytest.raises(TypeError, match=".* not dict type"): - resettable_env.DictExtractWrapper(wrapped_venv, "foobar") - obs = wrapped_venv.reset() - assert isinstance(obs, np.ndarray) - obs, _, _, _ = wrapped_venv.step([wrapped_venv.action_space.sample()]) - assert isinstance(obs, np.ndarray) From db72dda0a3f6c9b92080458fd7fc26e4dd89706f Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Mon, 29 Aug 2022 15:48:36 +0100 Subject: [PATCH 02/14] Fix bug in test --- tests/algorithms/test_mce_irl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/algorithms/test_mce_irl.py b/tests/algorithms/test_mce_irl.py index d4f2f026b..c9fde6893 100644 --- a/tests/algorithms/test_mce_irl.py +++ b/tests/algorithms/test_mce_irl.py @@ -315,7 +315,7 @@ def test_mce_irl_demo_formats(): generator_seed=42, ) state_env = envs.ExposePOMDPStateWrapper(mdp) - state_venv = vec_env.DummyVecEnv([lambda: state_env] * 4) + state_venv = vec_env.DummyVecEnv([lambda: state_env]) trajs = rollout.generate_trajectories( policy=None, venv=state_venv, @@ -392,7 +392,7 @@ def test_mce_irl_reasonable_mdp( assert tensor_iter_norm(reward_net.parameters()) < 1000 state_env = envs.ExposePOMDPStateWrapper(mdp) - state_venv = vec_env.DummyVecEnv([lambda: state_env] * 4) + state_venv = vec_env.DummyVecEnv([lambda: state_env]) trajs = rollout.generate_trajectories( mce_irl.policy, state_venv, From 14f933d87b1b26b7d3c06c6f477e9ceaeb35fabd Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Mon, 29 Aug 2022 15:53:23 +0100 Subject: [PATCH 03/14] Manually force CI to fetch latest seals changes --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d91ad2f36..c1c41e804 100644 --- a/setup.py +++ 
b/setup.py @@ -197,7 +197,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: "tqdm", "scikit-learn>=0.21.2", "seals@git+" - "https://github.com/HumanCompatibleAI/seals.git@imitation-envs-to-seals", + "https://github.com/HumanCompatibleAI/seals.git@b7dc25f78165d5ef87cb4d7d47790e391898d167", STABLE_BASELINES3, # TODO(adam) switch to upstream release if they make it # See https://github.com/IDSIA/sacred/issues/879 From 90764c76ff4a5ff5d6c2082deb1c4112857fa174 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Tue, 6 Sep 2022 13:31:07 +0200 Subject: [PATCH 04/14] Update code for new seals changes. --- setup.py | 3 ++- src/imitation/algorithms/mce_irl.py | 4 ++-- tests/algorithms/test_mce_irl.py | 4 +++- tests/test_envs.py | 4 +--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index c1c41e804..244aa5c9e 100644 --- a/setup.py +++ b/setup.py @@ -197,7 +197,8 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: "tqdm", "scikit-learn>=0.21.2", "seals@git+" - "https://github.com/HumanCompatibleAI/seals.git@b7dc25f78165d5ef87cb4d7d47790e391898d167", + "https://github.com/HumanCompatibleAI/seals.git" + "@cf97099cee047b26bb643dc6ab684aca9bcd83b2", STABLE_BASELINES3, # TODO(adam) switch to upstream release if they make it # See https://github.com/IDSIA/sacred/issues/879 diff --git a/src/imitation/algorithms/mce_irl.py b/src/imitation/algorithms/mce_irl.py index 8e9412be7..4b2391681 100644 --- a/src/imitation/algorithms/mce_irl.py +++ b/src/imitation/algorithms/mce_irl.py @@ -13,8 +13,8 @@ import numpy as np import scipy.special import torch as th -from stable_baselines3.common import policies from seals import base_envs as envs +from stable_baselines3.common import policies from imitation.algorithms import base from imitation.data import rollout, types @@ -360,7 +360,7 @@ def _set_demo_from_obs( else: warnings.warn( "Training MCEIRL with transitions that lack next observation." 
- "This gwill result in systematically wrong occupancy measure estimates.", + "This will result in systematically wrong occupancy measure estimates.", ) # Normalize occupancy measure estimates diff --git a/tests/algorithms/test_mce_irl.py b/tests/algorithms/test_mce_irl.py index c9fde6893..6447c3521 100644 --- a/tests/algorithms/test_mce_irl.py +++ b/tests/algorithms/test_mce_irl.py @@ -6,9 +6,9 @@ import numpy as np import pytest import torch as th -from stable_baselines3.common import vec_env from seals import base_envs as envs from seals.diagnostics import imitation_examples as imit_envs +from stable_baselines3.common import vec_env from imitation.algorithms import base from imitation.algorithms.mce_irl import ( @@ -118,7 +118,9 @@ def test_policy_om_random_mdp(discount: float): class ReasonablePOMDP(envs.TabularModelPOMDP): """A tabular MDP with sensible parameters.""" + def __init__(self): + """Initialize a ReasonablePOMDP.""" observation_matrix = np.array( [ [3, -5, -1, -1, -4, 5, 3, 0], diff --git a/tests/test_envs.py b/tests/test_envs.py index bec42ba9c..f465b1144 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -1,10 +1,8 @@ -"""Tests for `imitation.envs.*`.""" +"""Tests for seal environments.""" import gym -import numpy as np import pytest from seals.testing import envs as seals_test -from stable_baselines3.common import envs, vec_env from imitation.testing import envs as imitation_test From 94448150f826d6e5edbda96bb955ea0b7b74c847 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Tue, 6 Sep 2022 15:14:29 +0200 Subject: [PATCH 05/14] Fix notebook example --- examples/6_train_mce.ipynb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/6_train_mce.ipynb b/examples/6_train_mce.ipynb index d39ef6425..17d060e15 100644 --- a/examples/6_train_mce.ipynb +++ b/examples/6_train_mce.ipynb @@ -18,6 +18,9 @@ { "cell_type": "markdown", "metadata": { + "pycharm": { + "name": "#%% md\n" + } }, "source": [ "Cliffworld is a 
POMDP, and its \"observations\" consist of the (partial) observations proper and the (full) hidden environment state. We use `DictExtractWrapper` to extract only the hidden states from the environment, turning it into a fully observable MDP to make computing the optimal policy easy." @@ -34,8 +37,10 @@ "outputs": [], "source": [ "from functools import partial\n", + "\n", "from seals import base_envs as envs\n", "from seals.diagnostics.imitation_examples import CliffWorld\n", + "from stable_baselines3.common.vec_env import DummyVecEnv\n", "\n", "from imitation.algorithms.mce_irl import (\n", " MCEIRL,\n", @@ -44,14 +49,12 @@ " TabularPolicy,\n", ")\n", "from imitation.data import rollout\n", - "from stable_baselines3.common.vec_env import DummyVecEnv\n", "from imitation.rewards import reward_nets\n", "\n", - "\n", "env_creator = partial(CliffWorld, height=4, horizon=8, width=7, use_xy_obs=True)\n", "env_single = env_creator()\n", "\n", - "state_env_creator = partial(envs.ExposePOMDPStateWrapper, env_single)\n", + "state_env_creator = lambda: envs.ExposePOMDPStateWrapper(env_creator())\n", "\n", "# This is just a vectorized environment because `generate_trajectories` expects one\n", "state_venv = DummyVecEnv([state_env_creator] * 4)" @@ -293,4 +296,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From 45fe4c628cce9243adbca02f1aef2f2a3fe9457d Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Thu, 6 Oct 2022 15:45:43 +0100 Subject: [PATCH 06/14] Update seals version --- setup.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 3532c6297..465355489 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,6 @@ "sphinx-github-changelog~=1.2.0", "myst-nb==0.16.0", "ipykernel~=6.15.2", - "seals==0.1.2", ] + ATARI_REQUIRE @@ -198,9 +197,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: "torch>=1.4.0", "tqdm", "scikit-learn>=0.21.2", - "seals@git+" - 
"https://github.com/HumanCompatibleAI/seals.git" - "@cf97099cee047b26bb643dc6ab684aca9bcd83b2", + "seals==0.1.4", STABLE_BASELINES3, # TODO(adam) switch to upstream release if they make it # See https://github.com/IDSIA/sacred/issues/879 From 7174cfd2119a5ff38f25efd9f2bd8cf490232817 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Thu, 6 Oct 2022 16:07:53 +0100 Subject: [PATCH 07/14] Replace old seals naming convention --- tests/algorithms/test_mce_irl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/algorithms/test_mce_irl.py b/tests/algorithms/test_mce_irl.py index 6447c3521..37b3c8776 100644 --- a/tests/algorithms/test_mce_irl.py +++ b/tests/algorithms/test_mce_irl.py @@ -7,7 +7,7 @@ import pytest import torch as th from seals import base_envs as envs -from seals.diagnostics import imitation_examples as imit_envs +from seals.diagnostics import random_trans from stable_baselines3.common import vec_env from imitation.algorithms import base @@ -52,7 +52,7 @@ def test_random_mdp(): horizon = 5 * (i + 1) random_obs = (i % 2) == 0 obs_dim = (i * 3 + 4) ** 2 + i - mdp = imit_envs.RandomMDP( + mdp = random_trans.RandomTransitionEnv( n_states=n_states, n_actions=n_actions, branch_factor=branch_factor, @@ -307,7 +307,7 @@ def test_tabular_policy_randomness(): def test_mce_irl_demo_formats(): - mdp = imit_envs.RandomMDP( + mdp = random_trans.RandomTransitionEnv( n_states=5, n_actions=3, branch_factor=2, From ff911bd49802ccd13e2ae1a1a9911727fd25e440 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Thu, 6 Oct 2022 16:35:57 +0100 Subject: [PATCH 08/14] Fix docs examples --- docs/algorithms/mce_irl.rst | 8 +++++--- docs/tutorials/6_train_mce.ipynb | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/algorithms/mce_irl.rst b/docs/algorithms/mce_irl.rst index 0b2e8ce21..66d988746 100644 --- a/docs/algorithms/mce_irl.rst +++ b/docs/algorithms/mce_irl.rst @@ -13,6 +13,8 @@ Detailed example notebook: 
:doc:`../tutorials/6_train_mce` from functools import partial + from seals import base_envs as envs + from seals.diagnostics.cliff_world import CliffWorld from stable_baselines3.common.vec_env import DummyVecEnv from imitation.algorithms.mce_irl import ( @@ -21,15 +23,15 @@ Detailed example notebook: :doc:`../tutorials/6_train_mce` mce_partition_fh, ) from imitation.data import rollout - from imitation.envs import resettable_env - from imitation.envs.examples.model_envs import CliffWorld from imitation.rewards import reward_nets env_creator = partial(CliffWorld, height=4, horizon=8, width=7, use_xy_obs=True) env_single = env_creator() + state_env_creator = lambda: envs.ExposePOMDPStateWrapper(env_creator()) + # This is just a vectorized environment because `generate_trajectories` expects one - state_venv = resettable_env.DictExtractWrapper(DummyVecEnv([env_creator] * 4), "state") + state_venv = DummyVecEnv([state_env_creator] * 4) _, _, pi = mce_partition_fh(env_single) diff --git a/docs/tutorials/6_train_mce.ipynb b/docs/tutorials/6_train_mce.ipynb index e38892a0b..0c641e831 100644 --- a/docs/tutorials/6_train_mce.ipynb +++ b/docs/tutorials/6_train_mce.ipynb @@ -40,7 +40,7 @@ "from functools import partial\n", "\n", "from seals import base_envs as envs\n", - "from seals.diagnostics.imitation_examples import CliffWorld\n", + "from seals.diagnostics.cliff_world import CliffWorld\n", "from stable_baselines3.common.vec_env import DummyVecEnv\n", "\n", "from imitation.algorithms.mce_irl import (\n", From a2d0f1824f5ab38f68abecff7bc23e3441978379 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Thu, 6 Oct 2022 16:49:39 +0100 Subject: [PATCH 09/14] Rename environment identifier --- docs/algorithms/mce_irl.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/algorithms/mce_irl.rst b/docs/algorithms/mce_irl.rst index 66d988746..ff27bf81a 100644 --- a/docs/algorithms/mce_irl.rst +++ b/docs/algorithms/mce_irl.rst @@ -14,7 +14,7 @@ Detailed example 
notebook: :doc:`../tutorials/6_train_mce` from functools import partial from seals import base_envs as envs - from seals.diagnostics.cliff_world import CliffWorld + from seals.diagnostics.cliff_world import CliffWorldEnv from stable_baselines3.common.vec_env import DummyVecEnv from imitation.algorithms.mce_irl import ( @@ -25,7 +25,7 @@ Detailed example notebook: :doc:`../tutorials/6_train_mce` from imitation.data import rollout from imitation.rewards import reward_nets - env_creator = partial(CliffWorld, height=4, horizon=8, width=7, use_xy_obs=True) + env_creator = partial(CliffWorldEnv, height=4, horizon=8, width=7, use_xy_obs=True) env_single = env_creator() state_env_creator = lambda: envs.ExposePOMDPStateWrapper(env_creator()) From 74b40e65e54b7eb3f8673b0836ac8b25dbdcc7d7 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Thu, 6 Oct 2022 16:49:47 +0100 Subject: [PATCH 10/14] Rename environment identifier --- docs/tutorials/6_train_mce.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/6_train_mce.ipynb b/docs/tutorials/6_train_mce.ipynb index 0c641e831..1f7cee65b 100644 --- a/docs/tutorials/6_train_mce.ipynb +++ b/docs/tutorials/6_train_mce.ipynb @@ -40,7 +40,7 @@ "from functools import partial\n", "\n", "from seals import base_envs as envs\n", - "from seals.diagnostics.cliff_world import CliffWorld\n", + "from seals.diagnostics.cliff_world import CliffWorldEnv\n", "from stable_baselines3.common.vec_env import DummyVecEnv\n", "\n", "from imitation.algorithms.mce_irl import (\n", @@ -52,7 +52,7 @@ "from imitation.data import rollout\n", "from imitation.rewards import reward_nets\n", "\n", - "env_creator = partial(CliffWorld, height=4, horizon=8, width=7, use_xy_obs=True)\n", + "env_creator = partial(CliffWorldEnv, height=4, horizon=8, width=7, use_xy_obs=True)\n", "env_single = env_creator()\n", "\n", "state_env_creator = lambda: envs.ExposePOMDPStateWrapper(env_creator())\n", From 
dbc4ff8decab1748ca0b18859e507be878ba1770 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Thu, 6 Oct 2022 17:28:46 +0100 Subject: [PATCH 11/14] Rename wrong attribute --- docs/algorithms/mce_irl.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/algorithms/mce_irl.rst b/docs/algorithms/mce_irl.rst index ff27bf81a..74e93cb94 100644 --- a/docs/algorithms/mce_irl.rst +++ b/docs/algorithms/mce_irl.rst @@ -38,7 +38,7 @@ Detailed example notebook: :doc:`../tutorials/6_train_mce` _, om = mce_occupancy_measures(env_single, pi=pi) reward_net = reward_nets.BasicRewardNet( - env_single.pomdp_observation_space, + env_single.observation_space, env_single.action_space, hid_sizes=[256], use_action=False, From 49094c7ecffdae0833ca79451159430121cdfc8b Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Mon, 10 Oct 2022 18:04:07 +0100 Subject: [PATCH 12/14] Remove empty files from merge conflict --- src/imitation/envs/examples/model_envs.py | 0 src/imitation/envs/resettable_env.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/imitation/envs/examples/model_envs.py delete mode 100644 src/imitation/envs/resettable_env.py diff --git a/src/imitation/envs/examples/model_envs.py b/src/imitation/envs/examples/model_envs.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/imitation/envs/resettable_env.py b/src/imitation/envs/resettable_env.py deleted file mode 100644 index e69de29bb..000000000 From 41d01eecd20b4b58f4005462f2ffdc6fa533b2c5 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Tue, 11 Oct 2022 14:30:36 +0100 Subject: [PATCH 13/14] Remove unused env testing files --- src/imitation/testing/envs.py | 35 ------------------------- tests/test_envs.py | 49 ----------------------------------- 2 files changed, 84 deletions(-) delete mode 100644 src/imitation/testing/envs.py delete mode 100644 tests/test_envs.py diff --git a/src/imitation/testing/envs.py b/src/imitation/testing/envs.py deleted file mode 100644 
index 72978dd1a..000000000 --- a/src/imitation/testing/envs.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Helper methods for tests of custom Gym environments. - -This is used in the imitation test suite and may also be useful for users of this -library. -""" - -import gym - - -def test_model_based(env: gym.Env) -> None: - """Smoke test for each of the ModelBasedEnv methods with type checks. - - Args: - env: The environment to test. - - Raises: - AssertionError if test fails. - """ - state = env.initial_state() - assert env.state_space.contains(state) - - action = env.action_space.sample() - new_state = env.transition(state, action) - assert env.state_space.contains(new_state) - - reward = env.reward(state, action, new_state) - assert isinstance(reward, float) - - done = env.terminal(state, 0) - assert isinstance(done, bool) - - obs = env.obs_from_state(state) - assert env.observation_space.contains(obs) - next_obs = env.obs_from_state(new_state) - assert env.observation_space.contains(next_obs) diff --git a/tests/test_envs.py b/tests/test_envs.py deleted file mode 100644 index 194a0f596..000000000 --- a/tests/test_envs.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Tests for seal environments.""" -from typing import List - -import gym -import pytest -from seals.testing import envs as seals_test - -from imitation.testing import envs as imitation_test - -ENV_NAMES = [ - env_spec.id - for env_spec in gym.envs.registration.registry.all() - if env_spec.id.startswith("imitation/") -] - -DETERMINISTIC_ENVS: List[str] = [] - -env = pytest.fixture(seals_test.make_env_fixture(skip_fn=pytest.skip)) - - -@pytest.mark.parametrize("env_name", ENV_NAMES) -class TestEnvs: - """Battery of simple tests for environments.""" - - def test_seed(self, env, env_name): - seals_test.test_seed(env, env_name, DETERMINISTIC_ENVS) - - def test_premature_step(self, env): - """Test that you must call reset() before calling step().""" - seals_test.test_premature_step( - env, - skip_fn=pytest.skip, - 
raises_fn=pytest.raises, - ) - - def test_model_based(self, env): - """Smoke test for each of the ModelBasedEnv methods with type checks.""" - if not hasattr(env, "pomdp_state_space"): # pragma: no cover - pytest.skip("This test is only for subclasses of ResettableEnv.") - - imitation_test.test_model_based(env) - - def test_rollout_schema(self, env: gym.Env): - """Tests if environments have correct types on `step()` and `reset()`.""" - seals_test.test_rollout_schema(env) - - def test_render(self, env: gym.Env): - """Tests `render()` supports modes specified in environment metadata.""" - seals_test.test_render(env, raises_fn=pytest.raises) From 95d37ca04335cf16bf4195db1a69566da94c4553 Mon Sep 17 00:00:00 2001 From: Juan Rocamonde Date: Wed, 12 Oct 2022 12:06:02 +0100 Subject: [PATCH 14/14] Remove import rename for seals package --- docs/algorithms/mce_irl.rst | 4 ++-- docs/tutorials/6_train_mce.ipynb | 6 +++--- src/imitation/algorithms/mce_irl.py | 8 ++++---- tests/algorithms/test_mce_irl.py | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/algorithms/mce_irl.rst b/docs/algorithms/mce_irl.rst index 78be84a36..99cc68411 100644 --- a/docs/algorithms/mce_irl.rst +++ b/docs/algorithms/mce_irl.rst @@ -13,7 +13,7 @@ Detailed example notebook: :doc:`../tutorials/6_train_mce` from functools import partial - from seals import base_envs as envs + from seals import base_envs from seals.diagnostics.cliff_world import CliffWorldEnv import numpy as np @@ -32,7 +32,7 @@ Detailed example notebook: :doc:`../tutorials/6_train_mce` env_creator = partial(CliffWorldEnv, height=4, horizon=8, width=7, use_xy_obs=True) env_single = env_creator() - state_env_creator = lambda: envs.ExposePOMDPStateWrapper(env_creator()) + state_env_creator = lambda: base_envs.ExposePOMDPStateWrapper(env_creator()) # This is just a vectorized environment because `generate_trajectories` expects one state_venv = DummyVecEnv([state_env_creator] * 4) diff --git 
a/docs/tutorials/6_train_mce.ipynb b/docs/tutorials/6_train_mce.ipynb index 66c15cce8..115a1b6e7 100644 --- a/docs/tutorials/6_train_mce.ipynb +++ b/docs/tutorials/6_train_mce.ipynb @@ -25,7 +25,7 @@ "source": [ "from functools import partial\n", "\n", - "from seals import base_envs as envs\n", + "from seals import base_envs\n", "from seals.diagnostics.cliff_world import CliffWorldEnv\n", "from stable_baselines3.common.vec_env import DummyVecEnv\n", "\n", @@ -43,7 +43,7 @@ "env_creator = partial(CliffWorldEnv, height=4, horizon=8, width=7, use_xy_obs=True)\n", "env_single = env_creator()\n", "\n", - "state_env_creator = lambda: envs.ExposePOMDPStateWrapper(env_creator())\n", + "state_env_creator = lambda: base_envs.ExposePOMDPStateWrapper(env_creator())\n", "\n", "# This is just a vectorized environment because `generate_trajectories` expects one\n", "state_venv = DummyVecEnv([state_env_creator] * 4)" @@ -247,4 +247,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/src/imitation/algorithms/mce_irl.py b/src/imitation/algorithms/mce_irl.py index c739ec284..22262a5e1 100644 --- a/src/imitation/algorithms/mce_irl.py +++ b/src/imitation/algorithms/mce_irl.py @@ -13,7 +13,7 @@ import numpy as np import scipy.special import torch as th -from seals import base_envs as envs +from seals import base_envs from stable_baselines3.common import policies from imitation.algorithms import base @@ -24,7 +24,7 @@ def mce_partition_fh( - env: envs.TabularModelPOMDP, + env: base_envs.TabularModelPOMDP, *, reward: Optional[np.ndarray] = None, discount: float = 1.0, @@ -77,7 +77,7 @@ def mce_partition_fh( def mce_occupancy_measures( - env: envs.TabularModelPOMDP, + env: base_envs.TabularModelPOMDP, *, reward: Optional[np.ndarray] = None, pi: Optional[np.ndarray] = None, @@ -257,7 +257,7 @@ class MCEIRL(base.DemonstrationAlgorithm[types.TransitionsMinimal]): def __init__( self, demonstrations: Optional[MCEDemonstrations], - env: 
envs.TabularModelPOMDP, + env: base_envs.TabularModelPOMDP, reward_net: reward_nets.RewardNet, rng: np.random.Generator, optimizer_cls: Type[th.optim.Optimizer] = th.optim.Adam, diff --git a/tests/algorithms/test_mce_irl.py b/tests/algorithms/test_mce_irl.py index 33d33865a..f8347b46f 100644 --- a/tests/algorithms/test_mce_irl.py +++ b/tests/algorithms/test_mce_irl.py @@ -6,7 +6,7 @@ import numpy as np import pytest import torch as th -from seals import base_envs as envs +from seals import base_envs from seals.diagnostics import random_trans from stable_baselines3.common import vec_env @@ -116,7 +116,7 @@ def test_policy_om_random_mdp(discount: float): assert np.allclose(np.sum(D), expected_sum) -class ReasonablePOMDP(envs.TabularModelPOMDP): +class ReasonablePOMDP(base_envs.TabularModelPOMDP): """A tabular MDP with sensible parameters.""" def __init__(self): @@ -314,7 +314,7 @@ def test_mce_irl_demo_formats(rng): obs_dim=None, generator_seed=42, ) - state_env = envs.ExposePOMDPStateWrapper(mdp) + state_env = base_envs.ExposePOMDPStateWrapper(mdp) state_venv = vec_env.DummyVecEnv([lambda: state_env]) trajs = rollout.generate_trajectories( policy=None, @@ -406,7 +406,7 @@ def test_mce_irl_reasonable_mdp( # make sure weights have non-insane norm assert tensor_iter_norm(reward_net.parameters()) < 1000 - state_env = envs.ExposePOMDPStateWrapper(mdp) + state_env = base_envs.ExposePOMDPStateWrapper(mdp) state_venv = vec_env.DummyVecEnv([lambda: state_env]) trajs = rollout.generate_trajectories( mce_irl.policy,