openai · jkterry1 · May 24, 2022 · Apr 19, 2022
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -11,7 +11,6 @@ jobs:
  - uses: actions/checkout@v2
  - run: |
  docker build -f py.Dockerfile \
- --build-arg MUJOCO_KEY=$MUJOCO_KEY \
  --build-arg PYTHON_VERSION=${{ matrix.python-version }} \
  --tag gym-docker .
  - name: Run tests

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -9,7 +9,7 @@ repos:
  hooks:
  - id: codespell
  args:
- - --ignore-words-list=nd,reacher,thist,ths
+ - --ignore-words-list=nd,reacher,thist,ths,ure,referenc
  - repo: https://gitlab.com/PyCQA/flake8
  rev: 4.0.1
  hooks:

diff --git a/README.md b/README.md
@@ -46,6 +46,11 @@ env.close()
 
 Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
 
+## MuJoCo Environments
+
+The latest "\_v4" and all future versions of the MuJoCo environments no longer depend on `mujoco-py`; they require the `mujoco` package instead. Older versions of the gym MuJoCo environments that depend on `mujoco-py` are still kept, but they are no longer maintained.
+To install the dependencies for the latest gym MuJoCo environments, use `pip install gym[mujoco]`. Dependencies for the old MuJoCo environments can still be installed with `pip install gym[mujoco_py]`.
+
 ## Citation
 
 A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry:

diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py
@@ -162,27 +162,55 @@
  reward_threshold=-3.75,
 )
 
+register(
+ id="Reacher-v4",
+ entry_point="gym.envs.mujoco.reacher_v4:ReacherEnv",
+ max_episode_steps=50,
+ reward_threshold=-3.75,
+)
+
 register(
  id="Pusher-v2",
  entry_point="gym.envs.mujoco:PusherEnv",
  max_episode_steps=100,
  reward_threshold=0.0,
 )
 
+register(
+ id="Pusher-v4",
+ entry_point="gym.envs.mujoco.pusher_v4:PusherEnv",
+ max_episode_steps=100,
+ reward_threshold=0.0,
+)
+
 register(
  id="InvertedPendulum-v2",
  entry_point="gym.envs.mujoco:InvertedPendulumEnv",
  max_episode_steps=1000,
  reward_threshold=950.0,
 )
 
+register(
+ id="InvertedPendulum-v4",
+ entry_point="gym.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv",
+ max_episode_steps=1000,
+ reward_threshold=950.0,
+)
+
 register(
  id="InvertedDoublePendulum-v2",
  entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
  max_episode_steps=1000,
  reward_threshold=9100.0,
 )
 
+register(
+ id="InvertedDoublePendulum-v4",
+ entry_point="gym.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv",
+ max_episode_steps=1000,
+ reward_threshold=9100.0,
+)
+
 register(
  id="HalfCheetah-v2",
  entry_point="gym.envs.mujoco:HalfCheetahEnv",
@@ -197,6 +225,13 @@
  reward_threshold=4800.0,
 )
 
+register(
+ id="HalfCheetah-v4",
+ entry_point="gym.envs.mujoco.half_cheetah_v4:HalfCheetahEnv",
+ max_episode_steps=1000,
+ reward_threshold=4800.0,
+)
+
 register(
  id="Hopper-v2",
  entry_point="gym.envs.mujoco:HopperEnv",
@@ -211,6 +246,13 @@
  reward_threshold=3800.0,
 )
 
+register(
+ id="Hopper-v4",
+ entry_point="gym.envs.mujoco.hopper_v4:HopperEnv",
+ max_episode_steps=1000,
+ reward_threshold=3800.0,
+)
+
 register(
  id="Swimmer-v2",
  entry_point="gym.envs.mujoco:SwimmerEnv",
@@ -225,6 +267,13 @@
  reward_threshold=360.0,
 )
 
+register(
+ id="Swimmer-v4",
+ entry_point="gym.envs.mujoco.swimmer_v4:SwimmerEnv",
+ max_episode_steps=1000,
+ reward_threshold=360.0,
+)
+
 register(
  id="Walker2d-v2",
  max_episode_steps=1000,
@@ -237,6 +286,12 @@
  entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
 )
 
+register(
+ id="Walker2d-v4",
+ max_episode_steps=1000,
+ entry_point="gym.envs.mujoco.walker2d_v4:Walker2dEnv",
+)
+
 register(
  id="Ant-v2",
  entry_point="gym.envs.mujoco:AntEnv",
@@ -251,6 +306,13 @@
  reward_threshold=6000.0,
 )
 
+register(
+ id="Ant-v4",
+ entry_point="gym.envs.mujoco.ant_v4:AntEnv",
+ max_episode_steps=1000,
+ reward_threshold=6000.0,
+)
+
 register(
  id="Humanoid-v2",
  entry_point="gym.envs.mujoco:HumanoidEnv",
@@ -263,8 +325,20 @@
  max_episode_steps=1000,
 )
 
+register(
+ id="Humanoid-v4",
+ entry_point="gym.envs.mujoco.humanoid_v4:HumanoidEnv",
+ max_episode_steps=1000,
+)
+
 register(
  id="HumanoidStandup-v2",
  entry_point="gym.envs.mujoco:HumanoidStandupEnv",
  max_episode_steps=1000,
 )
+
+register(
+ id="HumanoidStandup-v4",
+ entry_point="gym.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv",
+ max_episode_steps=1000,
+)
diff --git a/gym/envs/mujoco/__init__.py b/gym/envs/mujoco/__init__.py
@@ -9,6 +9,7 @@
 from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
 from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
 from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
+from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer
 from gym.envs.mujoco.pusher import PusherEnv
 from gym.envs.mujoco.reacher import ReacherEnv
 from gym.envs.mujoco.swimmer import SwimmerEnv

diff --git a/gym/envs/mujoco/ant.py b/gym/envs/mujoco/ant.py
@@ -6,7 +6,7 @@
 
 class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
  def __init__(self):
- mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5)
+ mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5, mujoco_bindings="mujoco_py")
  utils.EzPickle.__init__(self)
 
  def step(self, a):

diff --git a/gym/envs/mujoco/ant_v3.py b/gym/envs/mujoco/ant_v3.py
@@ -9,167 +9,6 @@
 
 
 class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
- """
- ### Description
-
- This environment is based on the environment introduced by Schulman,
- Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
- Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
- The ant is a 3D robot consisting of one torso (free rotational body) with
- four legs attached to it with each leg having two links. The goal is to
- coordinate the four legs to move in the forward (right) direction by applying
- torques on the eight hinges connecting the two links of each leg and the torso
- (nine parts and eight hinges).
-
- ### Action Space
- The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
-
- | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
- |-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
- | 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
- | 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
- | 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
- | 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
- | 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
- | 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
- | 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
- | 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
-
- ### Observation Space
-
- Observations consist of positional values of different body parts of the ant,
- followed by the velocities of those individual parts (their derivatives) with all
- the positions ordered before all the velocities.
-
- By default, observations do not include the x- and y-coordinates of the ant's torso. These may
- be included by passing `exclude_current_positions_from_observation=False` during construction.
- In that case, the observation space will have 113 dimensions where the first two dimensions
- represent the x- and y- coordinates of the ant's torso.
- Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
- of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
-
- However, by default, an observation is a `ndarray` with shape `(111,)`
- where the elements correspond to the following:
-
- | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
- |-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
- | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
- | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
- | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
- | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
- | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
- | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
- | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
- | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
- | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
- | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
- | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
- | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
- | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
- | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
- | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
- | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
- | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
- | 21 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
- | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
- | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
- | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
- | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
- | 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
-
-
- The remaining 14*6 = 84 elements of the observation are contact forces
- (external forces - force x, y, z and torque x, y, z) applied to the
- center of mass of each of the links. The 14 links are: the ground link,
- the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
-
- The (x,y,z) coordinates are translational DOFs while the orientations are rotational
- DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
-
-
- **Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
- in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
- when using the Ant environment if you would like to report results with contact forces (if
- contact forces are not used in your experiments, you can use version > 2.0).
-
- ### Rewards
- The reward consists of three parts:
- - *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`
- - *forward_reward*: A reward of moving forward which is measured as
- *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
- between actions and is dependent on the `frame_skip` parameter (default is 5),
- where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
- This reward would be positive if the ant moves forward (in positive x direction).
- - *ctrl_cost*: A negative reward for penalising the ant if it takes actions
- that are too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
- where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5.
- - *contact_cost*: A negative reward for penalising the ant if the external contact
- force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact
- force to `contact_force_range`)<sup>2</sup>)*.
-
- The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms.
-
- ### Starting State
- All observations start in state
- (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
- of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise
- with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for
- stochasticity. Note that the initial z coordinate is intentionally selected
- to be slightly high, thereby indicating a standing up ant. The initial orientation
- is designed to make it face forward as well.
-
- ### Episode Termination
- The ant is said to be unhealthy if any of the following happens:
-
- 1. Any of the state space values is no longer finite
- 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0])
-
- If `terminate_when_unhealthy=True` is passed during construction (which is the default),
- the episode terminates when any of the following happens:
-
- 1. The episode duration reaches a 1000 timesteps
- 2. The ant is unhealthy
-
- If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
-
- ### Arguments
-
- No additional arguments are currently supported in v2 and lower.
-
- ```
- env = gym.make('Ant-v2')
- ```
-
- v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
-
- ```
- env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
- ```
-
- | Parameter | Type | Default |Description |
- |-------------------------|------------|--------------|-------------------------------|
- | `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model |
- | `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) |
- | `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) |
- | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
- | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
- | `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range |
- | `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* |
- | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
- | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
-
- ### Version History
-
- * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
- * v2: All continuous control environments now use mujoco_py >= 1.50
- * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
- * v0: Initial versions release (1.0.0)
- """
-
  def __init__(
  self,
  xml_file="ant.xml",
@@ -199,7 +38,7 @@ def __init__(
  exclude_current_positions_from_observation
  )
 
- mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
+ mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
 
  @property
  def healthy_reward(self):