facebookresearch · akshararai · Feb 10, 2023 · Dec 24, 2022 · Dec 26, 2022 · Dec 27, 2022
diff --git a/habitat-baselines/habitat_baselines/README.md b/habitat-baselines/habitat_baselines/README.md
@@ -53,6 +53,32 @@ To use them download pre-trained pytorch models from [link](https://dl.fbaipubli
 The `habitat_baselines/config/pointnav/ppo_pointnav.yaml` config has better hyperparameters for large scale training and loads the [Gibson PointGoal Navigation Dataset](/README.md#datasets) instead of the test scenes.
 Change the `/benchmark/nav/pointnav: pointnav_gibson` in `habitat_baselines/config/pointnav/ppo_pointnav.yaml` to `/benchmark/nav/pointnav: pointnav_mp3d` in the defaults list for training on [MatterPort3D PointGoal Navigation Dataset](/README.md#datasets).
 
+### Hierarchical Reinforcement Learning (HRL)
+
+We provide a two-layer hierarchical policy class, consisting of a low-level skill that moves the robot, and a high-level policy that reasons about which low-level skill to use in the current state. This can be especially powerful in long-horizon mobile manipulation tasks, like those introduced in [Habitat2.0](https://arxiv.org/abs/2106.14405). Both the low-level and the high-level policies can be either learned or an oracle. For the oracle high-level policy we use [PDDL](https://planning.wiki/guide/whatis/pddl), and for the oracle low-level skills we use instantaneous transitions, with the environment set to the final desired state. Additionally, for navigation, we provide an oracle navigation skill that uses A-star and the map of the environment to move the robot to its goal.
+
+To run the following examples, you need the [ReplicaCAD dataset](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md#replicacad).
+
+To train a high-level policy, while using pre-learned low-level skills (SRL baseline from [Habitat2.0](https://arxiv.org/abs/2106.14405)), you can run:
+
+```bash
+python -u habitat-baselines/habitat_baselines/run.py \
+  --exp-config habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml \
+  --run-type train
+```
+To run a rearrangement episode with oracle low-level skills and a fixed task planner, run:
+
+```bash
+python -u habitat-baselines/habitat_baselines/run.py \
+  --exp-config habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml \
+  --run-type eval \
+  habitat_baselines/rl/policy=hl_fixed \
+  habitat_baselines/rl/policy/hierarchical_policy/defined_skills=oracle_skills
+```
+
+To change the task that you train your skills on (for example, to set table), change the line `/habitat/task/rearrange: rearrange_easy` to `/habitat/task/rearrange: set_table` in the defaults list of your config.
+
+
 ### Classic
 
 **SLAM based**

diff --git a/habitat-baselines/habitat_baselines/agents/ppo_agents.py b/habitat-baselines/habitat_baselines/agents/ppo_agents.py
@@ -126,23 +126,19 @@ def reset(self) -> None:
     def act(self, observations: Observations) -> Dict[str, int]:
         batch = batch_obs([observations], device=self.device)
         with torch.no_grad():
-            (
-                _,
-                actions,
-                _,
-                self.test_recurrent_hidden_states,
-            ) = self.actor_critic.act(
+            action_data = self.actor_critic.act(
                 batch,
                 self.test_recurrent_hidden_states,
                 self.prev_actions,
                 self.not_done_masks,
                 deterministic=False,
             )
+            self.test_recurrent_hidden_states = action_data.rnn_hidden_states
             #  Make masks not done till reset (end of episode) will be called
             self.not_done_masks.fill_(True)
-            self.prev_actions.copy_(actions)  # type: ignore
+            self.prev_actions.copy_(action_data.actions)  # type: ignore
 
-        return {"action": actions[0][0].item()}
+        return {"action": action_data.env_actions[0][0].item()}
 
 
 def main():

diff --git a/habitat-baselines/habitat_baselines/common/baseline_registry.py b/habitat-baselines/habitat_baselines/common/baseline_registry.py
@@ -136,5 +136,30 @@ def register_auxiliary_loss(
     def get_auxiliary_loss(cls, name: str):
         return cls._get_impl("aux_loss", name)
 
+    @classmethod
+    def register_storage(cls, to_register=None, *, name: Optional[str] = None):
+        """
+        Registers data storage for storing data in the policy rollout in the
+        trainer and then for fetching data batches for the updater.
+        """
+
+        return cls._register_impl("storage", to_register, name)
+
+    @classmethod
+    def get_storage(cls, name: str):
+        return cls._get_impl("storage", name)
+
+    @classmethod
+    def register_updater(cls, to_register=None, *, name: Optional[str] = None):
+        """
+        Registers a policy updater.
+        """
+
+        return cls._register_impl("updater", to_register, name)
+
+    @classmethod
+    def get_updater(cls, name: str):
+        return cls._get_impl("updater", name)
+
 
 baseline_registry = BaselineRegistry()
diff --git a/habitat-baselines/habitat_baselines/common/rollout_storage.py b/habitat-baselines/habitat_baselines/common/rollout_storage.py
@@ -5,18 +5,24 @@
 # LICENSE file in the root directory of this source tree.
 
 import warnings
-from typing import Any, Dict, Iterator, Optional, Tuple
+from typing import Any, Dict, Iterator, Optional
 
 import numpy as np
 import torch
 
+from habitat_baselines.common.baseline_registry import baseline_registry
 from habitat_baselines.common.tensor_dict import DictTree, TensorDict
 from habitat_baselines.rl.models.rnn_state_encoder import (
     build_pack_info_from_dones,
     build_rnn_build_seq_info,
 )
+from habitat_baselines.utils.common import (
+    get_num_actions,
+    is_continuous_action_space,
+)
 
 
+@baseline_registry.register_storage
 class RolloutStorage:
     r"""Class for storing rollout information for RL trainers."""
 
@@ -28,10 +34,21 @@ def __init__(
         action_space,
         recurrent_hidden_state_size,
         num_recurrent_layers=1,
-        action_shape: Optional[Tuple[int]] = None,
         is_double_buffered: bool = False,
-        discrete_actions: bool = True,
     ):
+        if is_continuous_action_space(action_space):
+            # Assume ALL actions are NOT discrete
+            action_shape = (
+                get_num_actions(
+                    action_space,
+                ),
+            )
+            discrete_actions = False
+        else:
+            # For discrete pointnav
+            action_shape = (1,)
+            discrete_actions = True
+
         self.buffers = TensorDict()
         self.buffers["observations"] = TensorDict()
 
@@ -115,6 +132,7 @@ def insert(
         rewards=None,
         next_masks=None,
         buffer_index: int = 0,
+        **kwargs,
     ):
         if not self.is_double_buffered:
             assert buffer_index == 0

diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 from hydra.core.config_store import ConfigStore
 from omegaconf import MISSING
@@ -224,10 +224,45 @@ class Eq2CubeConfig(ObsTransformConfig):
 
 
 @dataclass
-class HierarchicalPolicy(HabitatBaselinesBaseConfig):
+class HrlDefinedSkillConfig(HabitatBaselinesBaseConfig):
+    """
+    Defines a low-level skill to be used in the hierarchical policy.
+    """
+
+    skill_name: str = MISSING
+    name: str = "PointNavResNetPolicy"
+    action_distribution_type: str = "gaussian"
+    load_ckpt_file: str = ""
+    max_skill_steps: int = 200
+    # If true, the stop action will be called if the skill times out.
+    force_end_on_timeout: bool = True
+    # Overrides the config file of a neural network skill rather than loading
+    # the config file from the checkpoint file.
+    force_config_file: str = ""
+    at_resting_threshold: float = 0.15
+    # If true, this will apply the post-conditions of the skill after it
+    # terminates.
+    apply_postconds: bool = False
+    obs_skill_inputs: List[str] = field(default_factory=list)
+    obs_skill_input_dim: int = 3
+    start_zone_radius: float = 0.3
+    # For the oracle navigation skill
+    action_name: str = "base_velocity"
+    stop_thresh: float = 0.001
+    # For the reset_arm_skill
+    reset_joint_state: List[float] = MISSING
+    # The set of PDDL action names (as defined in the PDDL domain file) that
+    # map to this skill. If not specified, the name of the skill must match the
+    # PDDL action name.
+    pddl_action_names: Optional[List[str]] = None
+
+
+@dataclass
+class HierarchicalPolicyConfig(HabitatBaselinesBaseConfig):
     high_level_policy: Dict[str, Any] = MISSING
-    defined_skills: Dict[str, Any] = field(default_factory=dict)
-    use_skills: Dict[str, str] = field(default_factory=dict)
+    defined_skills: Dict[str, HrlDefinedSkillConfig] = field(
+        default_factory=dict
+    )
 
 
 @dataclass
@@ -238,7 +273,7 @@ class PolicyConfig(HabitatBaselinesBaseConfig):
     # For gaussian action distribution:
     action_dist: ActionDistributionConfig = ActionDistributionConfig()
     obs_transforms: Dict[str, ObsTransformConfig] = field(default_factory=dict)
-    hierarchical_policy: HierarchicalPolicy = MISSING
+    hierarchical_policy: HierarchicalPolicyConfig = MISSING
 
 
 @dataclass
@@ -345,6 +380,8 @@ class HabitatBaselinesConfig(HabitatBaselinesBaseConfig):
     # )
     # cmd_trailing_opts: List[str] = field(default_factory=list)
     trainer_name: str = "ppo"
+    updater_name: str = "PPO"
+    distrib_updater_name: str = "DDPPO"
     torch_gpu_id: int = 0
     tensorboard_dir: str = "tb"
     writer_type: str = "tb"
@@ -355,6 +392,7 @@ class HabitatBaselinesConfig(HabitatBaselinesBaseConfig):
     eval_ckpt_path_dir: str = "data/checkpoints"
     num_environments: int = 16
     num_processes: int = -1  # deprecated
+    rollout_storage: str = "RolloutStorage"
     checkpoint_folder: str = "data/checkpoints"
     num_updates: int = 10000
     num_checkpoints: int = 10

diff --git a/...ines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml b/...ines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml
@@ -0,0 +1,44 @@
+open_cab:
+  skill_name: "ArtObjSkillPolicy"
+  load_ckpt_file: "data/models/open_cab.pth"
+
+open_fridge:
+  skill_name: "ArtObjSkillPolicy"
+  load_ckpt_file: "data/models/open_fridge.pth"
+
+close_cab:
+  skill_name: "ArtObjSkillPolicy"
+  load_ckpt_file: "data/models/close_cab.pth"
+
+close_fridge:
+  skill_name: "ArtObjSkillPolicy"
+  load_ckpt_file: "data/models/close_fridge.pth"
+
+pick:
+  skill_name: "PickSkillPolicy"
+  obs_skill_inputs: ["obj_start_sensor"]
+  load_ckpt_file: "data/models/pick.pth"
+
+place:
+  skill_name: "PlaceSkillPolicy"
+  obs_skill_inputs: ["obj_goal_sensor"]
+  load_ckpt_file: "data/models/place.pth"
+
+wait:
+  skill_name: "WaitSkillPolicy"
+  max_skill_steps: -1
+  force_end_on_timeout: False
+
+nav_to_obj:
+  skill_name: "NavSkillPolicy"
+  obs_skill_inputs: ["goal_to_agent_gps_compass"]
+  load_ckpt_file: "data/models/nav.pth"
+  max_skill_steps: 300
+  obs_skill_input_dim: 2
+  pddl_action_names: ["nav", "nav_to_receptacle"]
+
+reset_arm:
+  skill_name: "ResetArmSkill"
+  max_skill_steps: 50
+  reset_joint_state: [-4.50e-01, -1.08e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03]
+  force_end_on_timeout: False
diff --git a/.../config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml b/.../config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml
@@ -0,0 +1,69 @@
+# Oracle skills that will teleport to the skill post-condition. When automatically setting predicates you may want to run the simulation in kinematic mode:
+# To run in kinematic mode, add: `habitat.simulator.kinematic_mode=True habitat.simulator.ac_freq_ratio=1 habitat.task.measurements.force_terminate.max_accum_force=-1.0 habitat.task.measurements.force_terminate.max_instant_force=-1.0`
+
+defaults:
+  - /habitat/task/actions:
+    - pddl_apply_action
+
+open_cab:
+  skill_name: "NoopSkillPolicy"
+  max_skill_steps: 1
+  apply_postconds: True
+  force_end_on_timeout: False
+  pddl_action_names: ["open_cab_by_name"]
+
+open_fridge:
+  skill_name: "NoopSkillPolicy"
+  max_skill_steps: 1
+  apply_postconds: True
+  force_end_on_timeout: False
+  pddl_action_names: ["open_fridge_by_name"]
+
+close_cab:
+  skill_name: "NoopSkillPolicy"
+  obs_skill_inputs: ["obj_start_sensor"]
+  max_skill_steps: 1
+  apply_postconds: True
+  force_end_on_timeout: False
+  pddl_action_names: ["close_cab_by_name"]
+
+close_fridge:
+  skill_name: "NoopSkillPolicy"
+  obs_skill_inputs: ["obj_start_sensor"]
+  max_skill_steps: 1
+  apply_postconds: True
+  force_end_on_timeout: False
+  pddl_action_names: ["close_fridge_by_name"]
+
+pick:
+  skill_name: "NoopSkillPolicy"
+  obs_skill_inputs: ["obj_start_sensor"]
+  max_skill_steps: 1
+  apply_postconds: True
+  force_end_on_timeout: False
+
+place:
+  skill_name: "NoopSkillPolicy"
+  obs_skill_inputs: ["obj_goal_sensor"]
+  max_skill_steps: 1
+  apply_postconds: True
+  force_end_on_timeout: False
+
+wait:
+  skill_name: "WaitSkillPolicy"
+  max_skill_steps: -1
+
+nav_to_obj:
+  skill_name: "NoopSkillPolicy"
+  obs_skill_inputs: ["goal_to_agent_gps_compass"]
+  max_skill_steps: 1
+  apply_postconds: True
+  force_end_on_timeout: False
+  obs_skill_input_dim: 2
+  pddl_action_names: ["nav", "nav_to_receptacle_by_name"]
+
+reset_arm:
+  skill_name: "ResetArmSkill"
+  max_skill_steps: 50
+  reset_joint_state: [-4.50e-01, -1.07e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03]
+  force_end_on_timeout: False
diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml
@@ -0,0 +1,10 @@
+name: "HierarchicalPolicy"
+obs_transforms:
+  add_virtual_keys:
+    virtual_keys:
+      "goal_to_agent_gps_compass": 2
+hierarchical_policy:
+  high_level_policy:
+    name: "FixedHighLevelPolicy"
+    add_arm_rest: True
+  defined_skills: {}
diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml
@@ -0,0 +1,27 @@
+name: "HierarchicalPolicy"
+obs_transforms:
+  add_virtual_keys:
+    virtual_keys:
+      "goal_to_agent_gps_compass": 2
+hierarchical_policy:
+  high_level_policy:
+    name: "NeuralHighLevelPolicy"
+    allowed_actions:
+      - nav
+      - pick
+      - place
+      - nav_to_receptacle_by_name
+      - open_fridge_by_name
+      - close_fridge_by_name
+      - open_cab_by_name
+      - close_cab_by_name
+    allow_other_place: False
+    hidden_dim: 512
+    use_rnn: True
+    rnn_type: 'LSTM'
+    backbone: resnet18
+    normalize_visual_inputs: False
+    num_rnn_layers: 2
+    policy_input_keys:
+      - "robot_head_depth"
+  defined_skills: {}
diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml
@@ -0,0 +1,4 @@
+name: "PointNavResNetPolicy"
+action_distribution_type: "gaussian"
+action_dist:
+  use_log_std: True