diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e48b065e4ea..4c376ad0c26 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -11,7 +11,6 @@ jobs:
- uses: actions/checkout@v2
- run: |
docker build -f py.Dockerfile \
- --build-arg MUJOCO_KEY=$MUJOCO_KEY \
--build-arg PYTHON_VERSION=${{ matrix.python-version }} \
--tag gym-docker .
- name: Run tests
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ea16d0eae53..89a02ac19f8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,7 +9,7 @@ repos:
hooks:
- id: codespell
args:
- - --ignore-words-list=nd,reacher,thist,ths
+ - --ignore-words-list=nd,reacher,thist,ths, ure, referenc
- repo: https://gitlab.com/PyCQA/flake8
rev: 4.0.1
hooks:
diff --git a/README.md b/README.md
index 37b63f2d162..d6649dfd2dd 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,11 @@ env.close()
Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
+## MuJoCo Environments
+
+The latest "\_v4" and future versions of the MuJoCo environments will no longer depend on `mujoco-py`. Instead, `mujoco` will be the required dependency for future gym MuJoCo environment versions. Old gym MuJoCo environment versions that depend on `mujoco-py` will still be kept but will no longer be maintained.
+To install the dependencies for the latest gym MuJoCo environments use `pip install gym[mujoco]`. Dependencies for old MuJoCo environments can still be installed by `pip install gym[mujoco_py]`.
+
## Citation
A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry:
diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py
index 48a208ded37..3acec6db18c 100644
--- a/gym/envs/__init__.py
+++ b/gym/envs/__init__.py
@@ -162,6 +162,13 @@
reward_threshold=-3.75,
)
+register(
+ id="Reacher-v4",
+ entry_point="gym.envs.mujoco.reacher_v4:ReacherEnv",
+ max_episode_steps=50,
+ reward_threshold=-3.75,
+)
+
register(
id="Pusher-v2",
entry_point="gym.envs.mujoco:PusherEnv",
@@ -169,6 +176,13 @@
reward_threshold=0.0,
)
+register(
+ id="Pusher-v4",
+ entry_point="gym.envs.mujoco.pusher_v4:PusherEnv",
+ max_episode_steps=100,
+ reward_threshold=0.0,
+)
+
register(
id="InvertedPendulum-v2",
entry_point="gym.envs.mujoco:InvertedPendulumEnv",
@@ -176,6 +190,13 @@
reward_threshold=950.0,
)
+register(
+ id="InvertedPendulum-v4",
+ entry_point="gym.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv",
+ max_episode_steps=1000,
+ reward_threshold=950.0,
+)
+
register(
id="InvertedDoublePendulum-v2",
entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv",
@@ -183,6 +204,13 @@
reward_threshold=9100.0,
)
+register(
+ id="InvertedDoublePendulum-v4",
+ entry_point="gym.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv",
+ max_episode_steps=1000,
+ reward_threshold=9100.0,
+)
+
register(
id="HalfCheetah-v2",
entry_point="gym.envs.mujoco:HalfCheetahEnv",
@@ -197,6 +225,13 @@
reward_threshold=4800.0,
)
+register(
+ id="HalfCheetah-v4",
+ entry_point="gym.envs.mujoco.half_cheetah_v4:HalfCheetahEnv",
+ max_episode_steps=1000,
+ reward_threshold=4800.0,
+)
+
register(
id="Hopper-v2",
entry_point="gym.envs.mujoco:HopperEnv",
@@ -211,6 +246,13 @@
reward_threshold=3800.0,
)
+register(
+ id="Hopper-v4",
+ entry_point="gym.envs.mujoco.hopper_v4:HopperEnv",
+ max_episode_steps=1000,
+ reward_threshold=3800.0,
+)
+
register(
id="Swimmer-v2",
entry_point="gym.envs.mujoco:SwimmerEnv",
@@ -225,6 +267,13 @@
reward_threshold=360.0,
)
+register(
+ id="Swimmer-v4",
+ entry_point="gym.envs.mujoco.swimmer_v4:SwimmerEnv",
+ max_episode_steps=1000,
+ reward_threshold=360.0,
+)
+
register(
id="Walker2d-v2",
max_episode_steps=1000,
@@ -237,6 +286,12 @@
entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv",
)
+register(
+ id="Walker2d-v4",
+ max_episode_steps=1000,
+ entry_point="gym.envs.mujoco.walker2d_v4:Walker2dEnv",
+)
+
register(
id="Ant-v2",
entry_point="gym.envs.mujoco:AntEnv",
@@ -251,6 +306,13 @@
reward_threshold=6000.0,
)
+register(
+ id="Ant-v4",
+ entry_point="gym.envs.mujoco.ant_v4:AntEnv",
+ max_episode_steps=1000,
+ reward_threshold=6000.0,
+)
+
register(
id="Humanoid-v2",
entry_point="gym.envs.mujoco:HumanoidEnv",
@@ -263,8 +325,20 @@
max_episode_steps=1000,
)
+register(
+ id="Humanoid-v4",
+ entry_point="gym.envs.mujoco.humanoid_v4:HumanoidEnv",
+ max_episode_steps=1000,
+)
+
register(
id="HumanoidStandup-v2",
entry_point="gym.envs.mujoco:HumanoidStandupEnv",
max_episode_steps=1000,
)
+
+register(
+ id="HumanoidStandup-v4",
+ entry_point="gym.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv",
+ max_episode_steps=1000,
+)
diff --git a/gym/envs/mujoco/__init__.py b/gym/envs/mujoco/__init__.py
index fc4cc058be1..57658054276 100644
--- a/gym/envs/mujoco/__init__.py
+++ b/gym/envs/mujoco/__init__.py
@@ -9,6 +9,7 @@
from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv
from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
+from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer
from gym.envs.mujoco.pusher import PusherEnv
from gym.envs.mujoco.reacher import ReacherEnv
from gym.envs.mujoco.swimmer import SwimmerEnv
diff --git a/gym/envs/mujoco/ant.py b/gym/envs/mujoco/ant.py
index e61b787db98..70728dee5ea 100644
--- a/gym/envs/mujoco/ant.py
+++ b/gym/envs/mujoco/ant.py
@@ -6,7 +6,7 @@
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def __init__(self):
- mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5)
+ mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5, mujoco_bindings="mujoco_py")
utils.EzPickle.__init__(self)
def step(self, a):
diff --git a/gym/envs/mujoco/ant_v3.py b/gym/envs/mujoco/ant_v3.py
index aeffa507523..4260fd7d2f0 100644
--- a/gym/envs/mujoco/ant_v3.py
+++ b/gym/envs/mujoco/ant_v3.py
@@ -9,167 +9,6 @@
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
- """
- ### Description
-
- This environment is based on the environment introduced by Schulman,
- Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
- Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
- The ant is a 3D robot consisting of one torso (free rotational body) with
- four legs attached to it with each leg having two links. The goal is to
- coordinate the four legs to move in the forward (right) direction by applying
- torques on the eight hinges connecting the two links of each leg and the torso
- (nine parts and eight hinges).
-
- ### Action Space
- The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
-
- | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
- |-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
- | 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
- | 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) |
- | 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
- | 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) |
- | 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
- | 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) |
- | 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
- | 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) |
-
- ### Observation Space
-
- Observations consist of positional values of different body parts of the ant,
- followed by the velocities of those individual parts (their derivatives) with all
- the positions ordered before all the velocities.
-
- By default, observations do not include the x- and y-coordinates of the ant's torso. These may
- be included by passing `exclude_current_positions_from_observation=False` during construction.
- In that case, the observation space will have 113 dimensions where the first two dimensions
- represent the x- and y- coordinates of the ant's torso.
- Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
- of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
-
- However, by default, an observation is a `ndarray` with shape `(111,)`
- where the elements correspond to the following:
-
- | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
- |-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
- | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
- | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
- | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
- | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
- | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
- | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
- | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
- | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
- | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
- | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
- | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
- | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
- | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
- | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
- | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
- | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
- | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
- | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
- | 21 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
- | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
- | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
- | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
- | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
- | 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
-
-
- The remaining 14*6 = 84 elements of the observation are contact forces
- (external forces - force x, y, z and torque x, y, z) applied to the
- center of mass of each of the links. The 14 links are: the ground link,
- the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
-
- The (x,y,z) coordinates are translational DOFs while the orientations are rotational
- DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
-
-
- **Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results
- in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
- when using the Ant environment if you would like to report results with contact forces (if
- contact forces are not used in your experiments, you can use version > 2.0).
-
- ### Rewards
- The reward consists of three parts:
- - *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`
- - *forward_reward*: A reward of moving forward which is measured as
- *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
- between actions and is dependent on the `frame_skip` parameter (default is 5),
- where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
- This reward would be positive if the ant moves forward (in positive x direction).
- - *ctrl_cost*: A negative reward for penalising the ant if it takes actions
- that are too large. It is measured as *`ctrl_cost_weight` * sum(action2)*
- where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5.
- - *contact_cost*: A negative reward for penalising the ant if the external contact
- force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact
- force to `contact_force_range`)2)*.
-
- The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms.
-
- ### Starting State
- All observations start in state
- (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
- of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise
- with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for
- stochasticity. Note that the initial z coordinate is intentionally selected
- to be slightly high, thereby indicating a standing up ant. The initial orientation
- is designed to make it face forward as well.
-
- ### Episode Termination
- The ant is said to be unhealthy if any of the following happens:
-
- 1. Any of the state space values is no longer finite
- 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0])
-
- If `terminate_when_unhealthy=True` is passed during construction (which is the default),
- the episode terminates when any of the following happens:
-
- 1. The episode duration reaches a 1000 timesteps
- 2. The ant is unhealthy
-
- If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded.
-
- ### Arguments
-
- No additional arguments are currently supported in v2 and lower.
-
- ```
- env = gym.make('Ant-v2')
- ```
-
- v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
-
- ```
- env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
- ```
-
- | Parameter | Type | Default |Description |
- |-------------------------|------------|--------------|-------------------------------|
- | `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model |
- | `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) |
- | `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) |
- | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep |
- | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
- | `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range |
- | `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* |
- | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) |
- | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
-
- ### Version History
-
- * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
- * v2: All continuous control environments now use mujoco_py >= 1.50
- * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
- * v0: Initial versions release (1.0.0)
- """
-
def __init__(
self,
xml_file="ant.xml",
@@ -199,7 +38,7 @@ def __init__(
exclude_current_positions_from_observation
)
- mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
+ mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py")
@property
def healthy_reward(self):
diff --git a/gym/envs/mujoco/ant_v4.py b/gym/envs/mujoco/ant_v4.py
new file mode 100644
index 00000000000..0b8f2bbd785
--- /dev/null
+++ b/gym/envs/mujoco/ant_v4.py
@@ -0,0 +1,308 @@
+import numpy as np
+
+from gym import utils
+from gym.envs.mujoco import mujoco_env
+
+DEFAULT_CAMERA_CONFIG = {
+    "distance": 4.0,  # set as viewer.cam.distance by AntEnv.viewer_setup
+}
+
+
+class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
+ """
+ ### Description
+
+ This environment is based on the environment introduced by Schulman,
+ Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
+ Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
+ The ant is a 3D robot consisting of one torso (free rotational body) with
+ four legs attached to it with each leg having two links. The goal is to
+ coordinate the four legs to move in the forward (right) direction by applying
+ torques on the eight hinges connecting the two links of each leg and the torso
+ (nine parts and eight hinges).
+
+ ### Action Space
+ The agent takes an 8-element vector for actions.
+
+ The action space is a continuous `(action, action, action, action, action, action,
+ action, action)` all in `[-1, 1]`, where `action` represents the numerical torques
+ applied at the hinge joints.
+
+ | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
+ |-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
+ | 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) |
+ | 1 | Torque applied on the rotor between the front left two links | -1 | 1 | ankle_1 (front_left_leg) | hinge | torque (N m) |
+ | 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) |
+ | 3 | Torque applied on the rotor between the front right two links | -1 | 1 | ankle_2 (front_right_leg) | hinge | torque (N m) |
+ | 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) |
+ | 5 | Torque applied on the rotor between the back left two links | -1 | 1 | ankle_3 (back_leg) | hinge | torque (N m) |
+ | 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) |
+ | 7 | Torque applied on the rotor between the back right two links | -1 | 1 | ankle_4 (right_back_leg) | hinge | torque (N m) |
+
+ ### Observation Space
+
+ The state space consists of positional values of different body parts of the ant,
+ followed by the velocities of those individual parts (their derivatives) with all
+ the positions ordered before all the velocities.
+
+ The observation is a `ndarray` with shape `(111,)` where the elements correspond to the following:
+
+ | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit |
+ |-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------|
+ | 0 | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
+ | 1 | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
+ | 2 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) |
+ | 3 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
+ | 4 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
+ | 5 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
+ | 6 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) |
+ | 7 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
+ | 8 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
+ | 9 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
+ | 10 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
+ | 11 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
+ | 12 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
+ | 13 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
+ | 14 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
+ | 15 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
+ | 16 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
+ | 17 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) |
+ | 18 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
+ | 19 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
+ | 20 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) |
+ | 21 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) |
+ | 22 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) |
+ | 23 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) |
+ | 24 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) |
+ | 25 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) |
+ | 26 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) |
+ | 27 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) |
+ | 28 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) |
+
+
+ The remaining 14*6 = 84 elements in the state are contact forces
+ (external forces - force x, y, z and torque x, y, z) applied to the
+ center of mass of each of the links. The 14 links are: the ground link,
+ the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces.
+
+ The (x,y,z) coordinates are translational DOFs while the orientations are rotational
+ DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
+
+ **Note:** There are 29 elements in the table above - giving rise to `(113,)` elements
+ in the state space. In practice (and Gym implementation), the first two positional
+ elements are omitted from the state space since the reward function is calculated based
+ on the x-coordinate value. This value is hidden from the algorithm, which in turn has to
+ develop an abstract understanding of it from the observed rewards. Therefore, observation
+ space has shape `(111,)` instead of `(113,)` and the table should not have the first two rows.
+
+ **Note:** Ant-v4 environment no longer has the following contact forces issue.
+ For Ant versions prior to v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
+ in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
+ when using the Ant environment if you would like to report results with contact forces (if
+ contact forces are not used in your experiments, you can use version > 2.0).
+
+ **Note:** Ant-v4 has the option of including contact forces in the observation space. To add contact forces set the argument
+ `use_contact_forces` to True. The default value is False, in which case the observation holds only the 27 position and velocity
+ elements. Also note that training including contact forces can perform worse than not using them as shown in (https://github.com/openai/gym/pull/2762).
+
+ ### Rewards
+ The reward consists of four parts:
+ - *survive_reward*: Every timestep that the ant is alive, it gets a reward of 1.
+ - *forward_reward*: A reward of moving forward which is measured as
+ *(x-coordinate after action - x-coordinate before action)/dt*. *dt* is the time
+ between actions and is dependent on the frame_skip parameter (default is 5),
+ where the *dt* for one frame is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
+ This reward is positive if the ant moves forward (in the positive x direction), as desired.
+ - *ctrl_cost*: A negative reward for penalising the ant if it takes actions
+ that are too large. It is measured as *coefficient **x** sum(action<sup>2</sup>)*
+ where *coefficient* is a parameter set for the control and has a default value of 0.5.
+ - *contact_cost*: A negative reward for penalising the ant if the external contact
+ force is too large (only applied when `use_contact_forces` is True). It is calculated as
+ *0.5 * 0.001 * sum(clip(external contact force to [-1,1])<sup>2</sup>)*.
+
+ The total reward returned is ***reward*** *=* *survive_reward + forward_reward - ctrl_cost - contact_cost*
+
+ ### Starting State
+ All observations start in state
+ (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range
+ of [-0.1, 0.1] added to the positional values and standard normal noise
+ with 0 mean and 0.1 standard deviation added to the velocity values for
+ stochasticity. Note that the initial z coordinate is intentionally selected
+ to be slightly high, thereby indicating a standing up ant. The initial orientation
+ is designed to make it face forward as well.
+
+ ### Episode Termination
+ The episode terminates when any of the following happens:
+
+ 1. The episode duration reaches a 1000 timesteps
+ 2. Any of the state space values is no longer finite
+ 3. The z-coordinate of the torso (index 2) in the state is **not** in the range `[0.2, 1.0]`
+
+ ### Arguments
+
+ No additional arguments are currently supported (in v2 and lower), but modifications
+ can be made to the XML file in the assets folder (or by changing the path to a modified
+ XML file in another folder).
+
+ ```
+ env = gym.make('Ant-v2')
+ ```
+
+ v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
+
+ ```
+ env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...)
+ ```
+
+ ### Version History
+ * v4: all mujoco environments now use the mujoco bindings in mujoco>=2.1.3
+ * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+ * v2: All continuous control environments now use mujoco_py >= 1.50
+ * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
+ * v0: Initial versions release (1.0.0)
+ """
+
+    def __init__(
+        self,
+        xml_file="ant.xml",
+        ctrl_cost_weight=0.5,
+        use_contact_forces=False,
+        contact_cost_weight=5e-4,
+        healthy_reward=1.0,
+        terminate_when_unhealthy=True,
+        healthy_z_range=(0.2, 1.0),
+        contact_force_range=(-1.0, 1.0),
+        reset_noise_scale=0.1,
+        exclude_current_positions_from_observation=True,
+    ):
+        """Store the reward/termination/observation options, then init MujocoEnv."""
+        # Keep this first: locals() must hold only `self` and the constructor arguments.
+        utils.EzPickle.__init__(**locals())
+
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._contact_cost_weight = contact_cost_weight
+        self._healthy_reward = healthy_reward
+        self._terminate_when_unhealthy = terminate_when_unhealthy
+        self._healthy_z_range = healthy_z_range
+        self._contact_force_range = contact_force_range
+        self._reset_noise_scale = reset_noise_scale
+        self._use_contact_forces = use_contact_forces
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        # NOTE(review): base init is called last, presumably because it already needs
+        # the attributes above; 5 looks like frame_skip -- confirm in MujocoEnv.
+        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
+
+ @property
+ def healthy_reward(self):
+ return (
+ float(self.is_healthy or self._terminate_when_unhealthy)
+ * self._healthy_reward
+ )
+
+ def control_cost(self, action):
+ control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+ return control_cost
+
+ @property
+ def contact_forces(self):
+ raw_contact_forces = self.data.cfrc_ext
+ min_value, max_value = self._contact_force_range
+ contact_forces = np.clip(raw_contact_forces, min_value, max_value)
+ return contact_forces
+
+ @property
+ def contact_cost(self):
+ contact_cost = self._contact_cost_weight * np.sum(
+ np.square(self.contact_forces)
+ )
+ return contact_cost
+
+ @property
+ def is_healthy(self):
+ state = self.state_vector()
+ min_z, max_z = self._healthy_z_range
+ is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z
+ return is_healthy
+
+ @property
+ def done(self):
+ done = not self.is_healthy if self._terminate_when_unhealthy else False
+ return done
+
+ def step(self, action):
+ xy_position_before = self.get_body_com("torso")[:2].copy()
+ self.do_simulation(action, self.frame_skip)
+ xy_position_after = self.get_body_com("torso")[:2].copy()
+
+ xy_velocity = (xy_position_after - xy_position_before) / self.dt
+ x_velocity, y_velocity = xy_velocity
+
+ forward_reward = x_velocity
+ healthy_reward = self.healthy_reward
+
+ rewards = forward_reward + healthy_reward
+
+ costs = ctrl_cost = self.control_cost(action)
+
+ done = self.done
+ observation = self._get_obs()
+ info = {
+ "reward_forward": forward_reward,
+ "reward_ctrl": -ctrl_cost,
+ "reward_survive": healthy_reward,
+ "x_position": xy_position_after[0],
+ "y_position": xy_position_after[1],
+ "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
+ "x_velocity": x_velocity,
+ "y_velocity": y_velocity,
+ "forward_reward": forward_reward,
+ }
+ if self._use_contact_forces:
+ contact_cost = self.contact_cost
+ costs += contact_cost
+ info["reward_ctrl"] = -contact_cost
+
+ reward = rewards - costs
+
+ return observation, reward, done, info
+
+ def _get_obs(self):
+ position = self.data.qpos.flat.copy()
+ velocity = self.data.qvel.flat.copy()
+
+ if self._exclude_current_positions_from_observation:
+ position = position[2:]
+
+ if self._use_contact_forces:
+ contact_force = self.contact_forces.flat.copy()
+ return np.concatenate((position, velocity, contact_force))
+ else:
+ return np.concatenate((position, velocity))
+
+ def reset_model(self):
+ noise_low = -self._reset_noise_scale
+ noise_high = self._reset_noise_scale
+
+ qpos = self.init_qpos + self.np_random.uniform(
+ low=noise_low, high=noise_high, size=self.model.nq
+ )
+ qvel = (
+ self.init_qvel
+ + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
+ )
+ self.set_state(qpos, qvel)
+
+ observation = self._get_obs()
+
+ return observation
+
+ def viewer_setup(self):
+ for key, value in DEFAULT_CAMERA_CONFIG.items():
+ if isinstance(value, np.ndarray):
+ getattr(self.viewer.cam, key)[:] = value
+ else:
+ setattr(self.viewer.cam, key, value)
diff --git a/gym/envs/mujoco/assets/inverted_pendulum.xml b/gym/envs/mujoco/assets/inverted_pendulum.xml
index 396a0b3499a..85145042985 100644
--- a/gym/envs/mujoco/assets/inverted_pendulum.xml
+++ b/gym/envs/mujoco/assets/inverted_pendulum.xml
@@ -22,6 +22,6 @@