From a787e3416956e2bda1f731045275979f21b0444e Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 23 Dec 2022 16:09:50 -0800 Subject: [PATCH 01/48] Trainable HL policy --- .../config/default_structured_configs.py | 23 ++- .../config/rearrange/rl_hl_srl_onav.yaml | 176 ++++++++++++++++++ .../config/rearrange/tp_srl.yaml | 100 ++-------- .../config/rearrange/tp_srl_oracle_nav.yaml | 129 ++++--------- .../config/tp_srl_test/tp_srl_test.yaml | 99 ---------- .../rl/hrl/hierarchical_policy.py | 89 +++++++-- .../rl/hrl/high_level_policy.py | 81 -------- .../habitat_baselines/rl/hrl/hl/__init__.py | 3 +- .../rl/hrl/hl/fixed_policy.py | 67 ++----- .../rl/hrl/hl/high_level_policy.py | 84 ++++++++- .../rl/hrl/hl/neural_policy.py | 175 +++++++++++++++++ .../rl/hrl/skills/__init__.py | 2 + .../rl/hrl/skills/nn_skill.py | 13 +- .../habitat_baselines/rl/hrl/skills/noop.py | 39 ++++ .../rl/hrl/skills/oracle_nav.py | 7 +- .../habitat_baselines/rl/hrl/skills/pick.py | 9 +- .../habitat_baselines/rl/hrl/skills/reset.py | 5 +- .../habitat_baselines/rl/hrl/skills/skill.py | 99 ++++++++-- .../habitat_baselines/rl/hrl/skills/wait.py | 5 +- .../habitat_baselines/rl/hrl/utils.py | 2 +- .../habitat_baselines/rl/ppo/policy.py | 37 +++- .../habitat_baselines/rl/ppo/ppo_trainer.py | 40 ++-- .../rl/ver/inference_worker.py | 1 + .../tasks/rearrange/multi_task/pddl_action.py | 2 +- test/test_baseline_training.py | 58 ++++++ test/test_rearrange_task.py | 62 ------ 26 files changed, 861 insertions(+), 546 deletions(-) create mode 100644 habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml delete mode 100644 habitat-baselines/habitat_baselines/config/tp_srl_test/tp_srl_test.yaml delete mode 100644 habitat-baselines/habitat_baselines/rl/hrl/high_level_policy.py create mode 100644 habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py create mode 100644 habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index 46275ea0d0..56a51d6c82 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -219,10 +219,31 @@ class Eq2CubeConfig(ObsTransformConfig): ) +@dataclass +class HrlDefinedSkill(HabitatBaselinesBaseConfig): + skill_name: str = MISSING + name: str = "PointNavResNetPolicy" + action_distribution_type: str = "gaussian" + load_ckpt_file: str = "" + max_skill_steps: int = 200 + force_end_on_timeout: bool = True + force_config_file: str = "" + at_resting_threshold: float = 0.15 + apply_postconds: bool = False + obs_skill_inputs: List[str] = field(default_factory=list) + obs_skill_input_dim: int = 3 + start_zone_radius: float = 0.3 + # For the oracle navigation skill + nav_action_name: str = "base_velocity" + stop_thresh: float = 0.001 + # For the reset_arm_skill + reset_joint_state: List[float] = MISSING + + @dataclass class HierarchicalPolicy(HabitatBaselinesBaseConfig): high_level_policy: Dict[str, Any] = MISSING - defined_skills: Dict[str, Any] = field(default_factory=dict) + defined_skills: Dict[str, HrlDefinedSkill] = field(default_factory=dict) use_skills: Dict[str, str] = field(default_factory=dict) diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml new file mode 100644 index 0000000000..3f542bbf09 --- /dev/null +++ 
b/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml @@ -0,0 +1,176 @@ +# @package _global_ +# Pick and place are kinematically simulated. +# Navigation is fully simulated. + +defaults: + - /benchmark/rearrange: rearrange_easy + - /habitat_baselines: habitat_baselines_rl_config_base + - /habitat_baselines/rl/policy/obs_transforms: + - add_virtual_keys_base + - /habitat/task/actions: + - oracle_nav_action + - arm_action + - base_velocity + - rearrange_stop + - _self_ + +habitat_baselines: + verbose: False + trainer_name: "ddppo" + torch_gpu_id: 0 + tensorboard_dir: "tb" + video_dir: "video_dir" + video_fps: 30 + video_render_views: + - "third_rgb_sensor" + test_episode_count: -1 + eval_ckpt_path_dir: "" + num_environments: 1 + writer_type: 'tb' + checkpoint_folder: "data/new_checkpoints" + num_updates: -1 + total_num_steps: 1.0e8 + log_interval: 10 + num_checkpoints: 20 + force_torch_single_threaded: True + eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] + load_resume_state_config: False + + eval: + use_ckpt_config: False + should_load_ckpt: False + video_option: ["disk"] + + rl: + policy: + name: "HierarchicalPolicy" + obs_transforms: + add_virtual_keys: + virtual_keys: + "goal_to_agent_gps_compass": 2 + hierarchical_policy: + high_level_policy: + name: "NeuralHighLevelPolicy" + allow_other_place: False + hidden_dim: 512 + use_rnn: True + rnn_type: 'LSTM' + backbone: resnet18 + normalize_visual_inputs: False + num_rnn_layers: 2 + policy_input_keys: + - "robot_head_depth" + defined_skills: + open_cab: + skill_name: "ArtObjSkillPolicy" + name: "PointNavResNetPolicy" + action_distribution_type: "gaussian" + load_ckpt_file: "data/models/open_cab.pth" + max_skill_steps: 200 + apply_postconds: True + + open_fridge: + skill_name: "ArtObjSkillPolicy" + load_ckpt_file: "data/models/open_fridge.pth" + max_skill_steps: 200 + apply_postconds: True + + close_cab: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/close_cab.pth" + max_skill_steps: 200 + + close_fridge: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + apply_postconds: True + + pick: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + apply_postconds: True + + place: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_goal_sensor"] + max_skill_steps: 1 + apply_postconds: True + + nav_to_obj: + skill_name: "OracleNavPolicy" + obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] + max_skill_steps: 300 + + wait_skill: + skill_name: "WaitSkillPolicy" + max_skill_steps: -1 + force_end_on_timeout: False + + reset_arm_skill: + skill_name: "ResetArmSkill" + max_skill_steps: 50 + reset_joint_state: [-4.5003259e-01, -1.0799699e00, 9.9526465e-02, 9.3869519e-01, -7.8854430e-04, 1.5702540e00, 4.6168058e-03] + force_end_on_timeout: False + + use_skills: + # Uncomment if you are also using these skills + # open_cab: "open_cab" + # open_fridge: "open_fridge" + # close_cab: "open_cab" + # close_fridge: "open_fridge" + pick: "pick" + place: "place" + nav: "nav_to_obj" + nav_to_receptacle: "nav_to_obj" + wait: "wait_skill" + reset_arm: "reset_arm_skill" + + ppo: + # ppo params + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.0001 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + num_steps: 128 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + 
use_linear_lr_decay: False + reward_window_size: 50 + + use_normalized_advantage: False + + hidden_size: 512 + + # Use double buffered sampling, typically helps + # when environment time is similar or larger than + # policy inference time during rollout generation + use_double_buffered_sampler: False + + ddppo: + sync_frac: 0.6 + # The PyTorch distributed backend to use + distrib_backend: NCCL + # Visual encoder backbone + pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth + # Initialize with pretrained weights + pretrained: False + # Initialize just the visual encoder backbone with pretrained weights + pretrained_encoder: False + # Whether the visual encoder backbone will be trained. + train_encoder: True + # Whether to reset the critic linear layer + reset_critic: False + + # Model parameters + backbone: resnet18 + rnn_type: LSTM + num_recurrent_layers: 2 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml index 3524335f21..3f2fe45fb5 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml @@ -46,51 +46,30 @@ habitat_baselines: name: "FixedHighLevelPolicy" add_arm_rest: True defined_skills: - nn_open_cab: + open_cab: skill_name: "ArtObjSkillPolicy" name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: [] load_ckpt_file: "data/models/open_cab.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - nn_open_fridge: + open_fridge: skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: [] load_ckpt_file: "data/models/open_fridge.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - nn_close_cab: + close_cab: skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.2 obs_skill_inputs: ["obj_start_sensor"] load_ckpt_file: "data/models/close_cab.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - nn_close_fridge: + close_fridge: skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.2 obs_skill_inputs: ["obj_start_sensor"] load_ckpt_file: "data/models/close_fridge.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - nn_pick: + pick: skill_name: "PickSkillPolicy" name: "PointNavResNetPolicy" action_distribution_type: "gaussian" @@ -98,9 +77,8 @@ habitat_baselines: obs_skill_inputs: ["obj_start_sensor"] load_ckpt_file: "data/models/pick.pth" max_skill_steps: 200 - force_end_on_timeout: True - nn_place: + place: skill_name: "PlaceSkillPolicy" name: "PointNavResNetPolicy" action_distribution_type: "gaussian" @@ -108,23 +86,18 @@ habitat_baselines: obs_skill_inputs: ["obj_goal_sensor"] load_ckpt_file: "data/models/place.pth" max_skill_steps: 200 - force_end_on_timeout: True - nn_nav: + nav_to_obj: skill_name: "NavSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" obs_skill_inputs: ["goal_to_agent_gps_compass"] obs_skill_input_dim: 2 - lin_speed_stop: 0.067 - ang_speed_stop: 0.067 load_ckpt_file: "data/models/nav.pth" max_skill_steps: 300 force_end_on_timeout: False wait_skill: skill_name: "WaitSkillPolicy" - max_skill_steps: -1.0 + max_skill_steps: -1 
force_end_on_timeout: False reset_arm_skill: @@ -135,60 +108,19 @@ habitat_baselines: use_skills: # Uncomment if you are also using these skills - # open_cab: "NN_OPEN_CAB" - # open_fridge: "NN_OPEN_FRIDGE" - # close_cab: "NN_OPEN_CAB" - # close_fridge: "NN_OPEN_FRIDGE" - pick: "nn_pick" - place: "nn_place" - nav: "nn_nav" - nav_to_receptacle: "nn_nav" + # open_cab: "open_cab" + # open_fridge: "open_fridge" + # close_cab: "open_cab" + # close_fridgE: "open_fridge" + pick: "pick" + place: "place" + nav: "nav_to_obj" + nav_to_receptacle: "nav_to_obj" wait: "wait_skill" reset_arm: "reset_arm_skill" - ppo: - # ppo params - clip_param: 0.2 - ppo_epoch: 2 - num_mini_batch: 2 - value_loss_coef: 0.5 - entropy_coef: 0.0001 - lr: 2.5e-4 - eps: 1e-5 - max_grad_norm: 0.2 - num_steps: 128 - use_gae: True - gamma: 0.99 - tau: 0.95 - use_linear_clip_decay: False - use_linear_lr_decay: False - reward_window_size: 50 - - use_normalized_advantage: False - - hidden_size: 512 - - # Use double buffered sampling, typically helps - # when environment time is similar or larger than - # policy inference time during rollout generation - use_double_buffered_sampler: False - ddppo: - sync_frac: 0.6 - # The PyTorch distributed backend to use - distrib_backend: NCCL - # Visual encoder backbone - pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth - # Initialize with pretrained weights pretrained: False - # Initialize just the visual encoder backbone with pretrained weights pretrained_encoder: False - # Whether the visual encoder backbone will be trained. train_encoder: True - # Whether to reset the critic linear layer reset_critic: False - - # Model parameters - backbone: resnet18 - rnn_type: LSTM - num_recurrent_layers: 2 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml index 42f5524234..5e8b59a944 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml @@ -6,12 +6,27 @@ defaults: - /habitat_baselines/rl/policy/obs_transforms: - add_virtual_keys_base - /habitat/task/actions: + - oracle_nav_action - arm_action - base_velocity - rearrange_stop - - oracle_nav_action - _self_ +AGENT_0_ORACLE_NAV_ACTION: + TYPE: "OracleNavAction" + TURN_VELOCITY: 1.0 + FORWARD_VELOCITY: 1.0 + TURN_THRESH: 0.1 + DIST_THRESH: 0.2 + AGENT: 0 + LIN_SPEED: 40.0 + ANG_SPEED: 20.0 + MIN_ABS_LIN_SPEED: 1.0 + MIN_ABS_ANG_SPEED: 1.0 + ALLOW_DYN_SLIDE: False + END_ON_STOP: False + ALLOW_BACK: True + habitat_baselines: verbose: False trainer_name: "ddppo" @@ -32,10 +47,11 @@ habitat_baselines: num_checkpoints: 20 force_torch_single_threaded: True eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] + load_resume_state_config: False eval: - video_option: ["disk"] use_ckpt_config: False should_load_ckpt: False + video_option: ["disk"] rl: policy: @@ -49,51 +65,30 @@ habitat_baselines: name: "FixedHighLevelPolicy" add_arm_rest: True defined_skills: - NN_OPEN_CAB: + open_cab: skill_name: "ArtObjSkillPolicy" name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: [] load_ckpt_file: "data/models/open_cab.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - NN_OPEN_FRIDGE: + open_fridge: skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 
0.15 - obs_skill_inputs: [] load_ckpt_file: "data/models/open_fridge.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - NN_CLOSE_CAB: + close_cab: skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.2 obs_skill_inputs: ["obj_start_sensor"] load_ckpt_file: "data/models/close_cab.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - NN_CLOSE_FRIDGE: + close_fridge: skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.2 obs_skill_inputs: ["obj_start_sensor"] load_ckpt_file: "data/models/close_fridge.pth" max_skill_steps: 200 - start_zone_radius: 0.3 - force_end_on_timeout: True - nn_pick: + pick: skill_name: "PickSkillPolicy" name: "PointNavResNetPolicy" action_distribution_type: "gaussian" @@ -101,19 +96,8 @@ habitat_baselines: obs_skill_inputs: ["obj_start_sensor"] load_ckpt_file: "data/models/pick.pth" max_skill_steps: 200 - force_end_on_timeout: True - GT_NAV: - skill_name: "OracleNavPolicy" - obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] - goal_sensors: ["obj_goal_sensor", "abs_obj_goal_sensor"] - NAV_ACTION_NAME: "base_velocity" - max_skill_steps: 300 - force_end_on_timeout: True - stop_angle_thresh: 0.2 - stop_dist_thresh: 1.0 - - nn_place: + place: skill_name: "PlaceSkillPolicy" name: "PointNavResNetPolicy" action_distribution_type: "gaussian" @@ -121,13 +105,17 @@ habitat_baselines: obs_skill_inputs: ["obj_goal_sensor"] load_ckpt_file: "data/models/place.pth" max_skill_steps: 200 - force_end_on_timeout: True wait_skill: skill_name: "WaitSkillPolicy" - max_skill_steps: -1.0 + max_skill_steps: -1 force_end_on_timeout: False + gt_nav: + skill_name: "OracleNavPolicy" + obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] + max_skill_steps: 300 + reset_arm_skill: skill_name: "ResetArmSkill" max_skill_steps: 50 @@ -136,60 +124,19 @@ habitat_baselines: use_skills: # Uncomment if you are also using these skills - # open_cab: "NN_OPEN_CAB" - # open_fridge: "NN_OPEN_FRIDGE" - # close_cab: "NN_OPEN_CAB" - # close_fridge: "NN_OPEN_FRIDGE" - pick: "nn_pick" - place: "nn_place" - nav: "GT_NAV" - nav_to_receptacle: "GT_NAV" + # open_cab: "open_cab" + # open_fridge: "open_fridge" + # close_cab: "open_cab" + # close_fridge: "open_fridge" + pick: "pick" + place: "place" + nav: "gt_nav" + nav_to_receptacle: "gt_nav" wait: "wait_skill" reset_arm: "reset_arm_skill" - ppo: - # ppo params - clip_param: 0.2 - ppo_epoch: 2 - num_mini_batch: 2 - value_loss_coef: 0.5 - entropy_coef: 0.0001 - lr: 2.5e-4 - eps: 1e-5 - max_grad_norm: 0.2 - num_steps: 128 - use_gae: True - gamma: 0.99 - tau: 0.95 - use_linear_clip_decay: False - use_linear_lr_decay: False - reward_window_size: 50 - - use_normalized_advantage: False - - hidden_size: 512 - - # Use double buffered sampling, typically helps - # when environment time is similar or larger than - # policy inference time during rollout generation - use_double_buffered_sampler: False - ddppo: - sync_frac: 0.6 - # The PyTorch distributed backend to use - distrib_backend: NCCL - # Visual encoder backbone - pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth - # Initialize with pretrained weights pretrained: False - # Initialize just the visual encoder backbone with pretrained weights pretrained_encoder: False - # Whether the visual encoder backbone 
will be trained. train_encoder: True - # Whether to reset the critic linear layer reset_critic: False - - # Model parameters - backbone: resnet18 - rnn_type: LSTM - num_recurrent_layers: 2 diff --git a/habitat-baselines/habitat_baselines/config/tp_srl_test/tp_srl_test.yaml b/habitat-baselines/habitat_baselines/config/tp_srl_test/tp_srl_test.yaml deleted file mode 100644 index fd413e2749..0000000000 --- a/habitat-baselines/habitat_baselines/config/tp_srl_test/tp_srl_test.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# @package _global_ - -defaults: - - /benchmark/rearrange: rearrange_easy - - /habitat_baselines: habitat_baselines_rl_config_base - - /habitat_baselines/rl/policy/obs_transforms: - - add_virtual_keys_base - -habitat_baselines: - verbose: False - trainer_name: "ppo" - torch_gpu_id: 0 - tensorboard_dir: "" - video_dir: "data/test_checkpoints/ppo/pointnav/video" - test_episode_count: 2 - eval_ckpt_path_dir: "" - num_environments: 1 - checkpoint_folder: "data/test_checkpoints/ppo/pointnav/" - num_updates: 2 - log_interval: 100 - num_checkpoints: 2 - force_torch_single_threaded: True - load_resume_state_config: False - eval: - use_ckpt_config: False - should_load_ckpt: False - - rl: - policy: - name: "HierarchicalPolicy" - - obs_transforms: - add_virtual_keys: - virtual_keys: - "goal_to_agent_gps_compass": 2 - hierarchical_policy: - high_level_policy: - name: "FixedHighLevelPolicy" - add_arm_rest: True - defined_skills: - nn_pick: - skill_name: "PickSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "" - max_skill_steps: 200 - force_end_on_timeout: True - force_config_file: "benchmark/rearrange=pick" - - nn_place: - skill_name: "PlaceSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: ["obj_goal_sensor"] - load_ckpt_file: "" - max_skill_steps: 200 - force_end_on_timeout: True - force_config_file: "benchmark/rearrange=place" - - nn_nav: - skill_name: "NavSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - obs_skill_inputs: ["goal_to_agent_gps_compass"] - obs_skill_input_dim: 2 - lin_speed_stop: 0.067 - ang_speed_stop: 0.067 - load_ckpt_file: "" - max_skill_steps: 300 - force_end_on_timeout: False - force_config_file: "benchmark/rearrange=nav_to_obj" - - wait_skill: - skill_name: "WaitSkillPolicy" - max_skill_steps: -1.0 - force_end_on_timeout: False - - reset_arm_skill: - skill_name: "ResetArmSkill" - max_skill_steps: 50 - reset_joint_state: [-4.5003259e-01, -1.0799699e00, 9.9526465e-02, 9.3869519e-01, -7.8854430e-04, 1.5702540e00, 4.6168058e-03] - force_end_on_timeout: False - - use_skills: - # Uncomment if you are also using these skills - pick: "nn_pick" - place: "nn_place" - nav: "nn_nav" - nav_to_receptacle: "nn_nav" - wait: "wait_skill" - reset_arm: "reset_arm_skill" - ddppo: - pretrained: False - pretrained_encoder: False - train_encoder: True - reset_critic: False diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index e0aa4e7e77..f9217051e4 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -13,15 +13,18 @@ from habitat.tasks.rearrange.multi_task.composite_sensors import ( CompositeSuccess, ) +from habitat.tasks.rearrange.multi_task.pddl_domain import 
PddlProblem from habitat_baselines.common.baseline_registry import baseline_registry from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.hrl.hl import ( # noqa: F401. FixedHighLevelPolicy, HighLevelPolicy, + NeuralHighLevelPolicy, ) from habitat_baselines.rl.hrl.skills import ( # noqa: F401. ArtObjSkillPolicy, NavSkillPolicy, + NoopSkillPolicy, OracleNavPolicy, PickSkillPolicy, PlaceSkillPolicy, @@ -30,7 +33,7 @@ WaitSkillPolicy, ) from habitat_baselines.rl.hrl.utils import find_action_range -from habitat_baselines.rl.ppo.policy import Policy +from habitat_baselines.rl.ppo.policy import Policy, PolicyAction from habitat_baselines.utils.common import get_num_actions @@ -54,6 +57,18 @@ def __init__( self._name_to_idx: Dict[str, int] = {} self._idx_to_name: Dict[int, str] = {} + task_spec_file = osp.join( + full_config.habitat.task.task_spec_base_path, + full_config.habitat.task.task_spec + ".yaml", + ) + domain_file = full_config.habitat.task.pddl_domain_def + + self._pddl_problem = PddlProblem( + domain_file, + task_spec_file, + config, + ) + for i, (skill_id, use_skill_name) in enumerate( config.hierarchical_policy.use_skills.items() ): @@ -72,6 +87,7 @@ def __init__( self._num_envs, full_config, ) + skill_policy.set_pddl_problem(self._pddl_problem) self._skills[i] = skill_policy self._name_to_idx[skill_id] = i self._idx_to_name[i] = skill_id @@ -88,12 +104,11 @@ def __init__( ) self._high_level_policy: HighLevelPolicy = high_level_cls( config.hierarchical_policy.high_level_policy, - osp.join( - full_config.habitat.task.task_spec_base_path, - full_config.habitat.task.task_spec + ".yaml", - ), + self._pddl_problem, num_envs, self._name_to_idx, + observation_space, + action_space, ) self._stop_action_idx, _ = find_action_range( action_space, "rearrange_stop" @@ -102,35 +117,41 @@ def __init__( def eval(self): pass - def get_policy_info(self, infos, dones): - policy_infos = [] - for i, info in enumerate(infos): + def extract_policy_info( + self, action_data, infos, dones + ) -> List[Dict[str, float]]: + ret_policy_infos = [] + for i, (info, policy_info) in enumerate( + zip(infos, action_data.policy_info) + ): cur_skill_idx = self._cur_skills[i].item() - policy_info: Dict[str, Any] = { - "cur_skill": self._idx_to_name[cur_skill_idx] + ret_policy_info: Dict[str, Any] = { + "cur_skill": self._idx_to_name[cur_skill_idx], + **policy_info, } did_skill_fail = dones[i] and not info[CompositeSuccess.cls_uuid] for skill_name, idx in self._name_to_idx.items(): - policy_info[f"failed_skill_{skill_name}"] = ( + ret_policy_info[f"failed_skill_{skill_name}"] = ( did_skill_fail if idx == cur_skill_idx else 0.0 ) - policy_infos.append(policy_info) + ret_policy_infos.append(ret_policy_info) - return policy_infos + return ret_policy_infos @property def num_recurrent_layers(self): - return self._skills[0].num_recurrent_layers + return self._high_level_policy.num_recurrent_layers @property def should_load_agent_state(self): return False def parameters(self): - return self._skills[0].parameters() # type: ignore[attr-defined] + return self._high_level_policy.parameters() def to(self, device): + self._high_level_policy.to(device) for skill in self._skills.values(): skill.to(device) @@ -173,6 +194,7 @@ def act( deterministic=False, ): + log_info: List[Dict[str, Any]] = [{} for _ in range(self._num_envs)] self._high_level_policy.apply_mask(masks) # type: ignore[attr-defined] should_terminate: torch.BoolTensor = torch.zeros( @@ -182,6 +204,15 @@ def act( (self._num_envs,), 
dtype=torch.bool ) + hl_says_term = self._high_level_policy.get_termination( + observations, + rnn_hidden_states, + prev_actions, + masks, + self._cur_skills, + log_info, + ) + grouped_skills = self._broadcast_skill_ids( self._cur_skills, sel_dat={ @@ -189,6 +220,7 @@ def act( "rnn_hidden_states": rnn_hidden_states, "prev_actions": prev_actions, "masks": masks, + "hl_says_term": hl_says_term, }, # Only decide on skill termination if the episode is active. should_adds=masks, @@ -206,6 +238,11 @@ def act( ) = self._skills[skill_id].should_terminate( **dat, batch_idx=batch_ids, + log_info=log_info, + skill_name=[ + self._idx_to_name[self._cur_skills[i].item()] + for i in batch_ids + ], ) self._call_high_level = should_terminate @@ -215,17 +252,21 @@ def act( # If any skills want to terminate invoke the high-level policy to get # the next skill. hl_terminate = torch.zeros(self._num_envs, dtype=torch.bool) + hl_info: Dict[str, Any] = {} if self._call_high_level.sum() > 0: ( new_skills, new_skill_args, hl_terminate, + hl_info, ) = self._high_level_policy.get_next_skill( observations, rnn_hidden_states, prev_actions, masks, self._call_high_level, + deterministic, + log_info, ) sel_grouped_skills = self._broadcast_skill_ids( @@ -264,7 +305,7 @@ def act( }, ) for skill_id, (batch_ids, batch_dat) in grouped_skills.items(): - tmp_actions, tmp_rnn = self._skills[skill_id].act( + action_data = self._skills[skill_id].act( observations=batch_dat["observations"], rnn_hidden_states=batch_dat["rnn_hidden_states"], prev_actions=batch_dat["prev_actions"], @@ -273,8 +314,8 @@ def act( ) # LL skills are not allowed to terminate the overall episode. - actions[batch_ids] = tmp_actions - rnn_hidden_states[batch_ids] = tmp_rnn + actions[batch_ids] = action_data.actions + rnn_hidden_states[batch_ids] = action_data.rnn_hidden_states actions[:, self._stop_action_idx] = 0.0 should_terminate = bad_should_terminate | hl_terminate @@ -286,7 +327,17 @@ def act( ) actions[batch_idx, self._stop_action_idx] = 1.0 - return (None, actions, None, rnn_hidden_states) + ac_info = { + "values": None, + "action_log_probs": None, + "rnn_hidden_states": rnn_hidden_states, + "actions": actions, + } + ac_info.update(hl_info) + + return PolicyAction( + take_actions=actions, policy_info=log_info, **ac_info + ) @classmethod def from_config( diff --git a/habitat-baselines/habitat_baselines/rl/hrl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/high_level_policy.py deleted file mode 100644 index 082dc15434..0000000000 --- a/habitat-baselines/habitat_baselines/rl/hrl/high_level_policy.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import List, Tuple - -import torch -import yaml - -from habitat.config.default import get_full_habitat_config_path -from habitat.tasks.rearrange.multi_task.rearrange_pddl import parse_func -from habitat_baselines.common.logging import baselines_logger - - -class FixedHighLevelPolicy: - """ - :property _solution_actions: List of tuples were first tuple element is the - action name and the second is the action arguments. 
- """ - - _solution_actions: List[Tuple[str, List[str]]] - - def __init__(self, config, task_spec_file, num_envs, skill_name_to_idx): - with open(get_full_habitat_config_path(task_spec_file), "r") as f: - task_spec = yaml.safe_load(f) - - self._solution_actions = [] - if "solution" not in task_spec: - raise ValueError( - f"The ground truth task planner only works when the task solution is hard-coded in the PDDL problem file at {task_spec_file}" - ) - for i, sol_step in enumerate(task_spec["solution"]): - sol_action = parse_func(sol_step) - self._solution_actions.append(sol_action) - if config.add_arm_rest and i < (len(task_spec["solution"]) - 1): - self._solution_actions.append(parse_func("reset_arm(0)")) - - # Add a wait action at the end. - self._solution_actions.append(parse_func("wait(30)")) - - self._next_sol_idxs = torch.zeros(num_envs, dtype=torch.int32) - self._num_envs = num_envs - self._skill_name_to_idx = skill_name_to_idx - - def apply_mask(self, mask): - self._next_sol_idxs *= mask.cpu().view(-1) - - def get_next_skill( - self, observations, rnn_hidden_states, prev_actions, masks, plan_masks - ): - next_skill = torch.zeros(self._num_envs) - skill_args_data = [None for _ in range(self._num_envs)] - immediate_end = torch.zeros(self._num_envs, dtype=torch.bool) - for batch_idx, should_plan in enumerate(plan_masks): - if should_plan == 1.0: - if self._next_sol_idxs[batch_idx] >= len( - self._solution_actions - ): - baselines_logger.info( - f"Calling for immediate end with {self._next_sol_idxs[batch_idx]}" - ) - immediate_end[batch_idx] = True - use_idx = len(self._solution_actions) - 1 - else: - use_idx = self._next_sol_idxs[batch_idx].item() - - skill_name, skill_args = self._solution_actions[use_idx] - baselines_logger.info( - f"Got next element of the plan with {skill_name}, {skill_args}" - ) - if skill_name not in self._skill_name_to_idx: - raise ValueError( - f"Could not find skill named {skill_name} in {self._skill_name_to_idx}" - ) - next_skill[batch_idx] = self._skill_name_to_idx[skill_name] - - skill_args_data[batch_idx] = skill_args # type: ignore[call-overload] - - self._next_sol_idxs[batch_idx] += 1 - - return next_skill, skill_args_data, immediate_end diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/__init__.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/__init__.py index 09023c316f..31bc4beeb1 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/__init__.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/__init__.py @@ -1,4 +1,5 @@ from habitat_baselines.rl.hrl.hl.fixed_policy import FixedHighLevelPolicy from habitat_baselines.rl.hrl.hl.high_level_policy import HighLevelPolicy +from habitat_baselines.rl.hrl.hl.neural_policy import NeuralHighLevelPolicy -__all__ = ["HighLevelPolicy", "FixedHighLevelPolicy"] +__all__ = ["HighLevelPolicy", "FixedHighLevelPolicy", "NeuralHighLevelPolicy"] diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py index acbed7d418..8b6e6a6552 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py @@ -5,9 +5,7 @@ from typing import List, Tuple import torch -import yaml -from habitat.config.default import get_full_habitat_config_path from habitat.tasks.rearrange.multi_task.rearrange_pddl import parse_func from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.hrl.hl.high_level_policy import HighLevelPolicy @@ -23,39 
+21,25 @@ class FixedHighLevelPolicy(HighLevelPolicy): _solution_actions: List[Tuple[str, List[str]]] - def __init__(self, config, task_spec_file, num_envs, skill_name_to_idx): - """ - Initialize the `FixedHighLevelPolicy` object. - - Args: - config: Config object containing the configurations for the agent. - task_spec_file: Path to the task specification file. - num_envs: Number of parallel environments. - skill_name_to_idx: Dictionary mapping skill names to skill indices. - """ - with open(get_full_habitat_config_path(task_spec_file), "r") as f: - task_spec = yaml.safe_load(f) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) - self._num_envs = num_envs - self._skill_name_to_idx = skill_name_to_idx self._solution_actions = self._parse_solution_actions( - config, task_spec, task_spec_file + self._pddl_prob.solution ) - self._next_sol_idxs = torch.zeros(num_envs, dtype=torch.int32) - - def _parse_solution_actions(self, config, task_spec, task_spec_file): - if "solution" not in task_spec: - raise ValueError( - f"The ground truth task planner only works when the task solution is hard-coded in the PDDL problem file at {task_spec_file}." - ) + self._next_sol_idxs = torch.zeros(self._num_envs, dtype=torch.int32) + def _parse_solution_actions(self, solution): solution_actions = [] - for i, sol_step in enumerate(task_spec["solution"]): - sol_action = parse_func(sol_step) + for i, sol_step in enumerate(solution): + sol_action = ( + sol_step.name, + [x.name for x in sol_step.param_values], + ) solution_actions.append(sol_action) - if config.add_arm_rest and i < (len(task_spec["solution"]) - 1): + if self._config.add_arm_rest and i < (len(solution) - 1): solution_actions.append(parse_func("reset_arm(0)")) # Add a wait action at the end. @@ -93,26 +77,15 @@ def _get_next_sol_idx(self, batch_idx, immediate_end): return self._next_sol_idxs[batch_idx].item() def get_next_skill( - self, observations, rnn_hidden_states, prev_actions, masks, plan_masks + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + plan_masks, + deterministic, + log_info, ): - """ - Get the next skill to be executed. - - Args: - observations: Current observations. - rnn_hidden_states: Current hidden states of the RNN. - prev_actions: Previous actions taken. - masks: Binary masks indicating which environment(s) are active. - plan_masks: Binary masks indicating which environment(s) should - plan the next skill. - - Returns: - A tuple containing: - - next_skill: Next skill to be executed. - - skill_args_data: Arguments for the next skill. - - immediate_end: Binary masks indicating which environment(s) should - end immediately. 
- """ next_skill = torch.zeros(self._num_envs) skill_args_data = [None for _ in range(self._num_envs)] immediate_end = torch.zeros(self._num_envs, dtype=torch.bool) @@ -134,4 +107,4 @@ def get_next_skill( self._next_sol_idxs[batch_idx] += 1 - return next_skill, skill_args_data, immediate_end + return next_skill, skill_args_data, immediate_end, {} diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index 848ebd92e1..27bb1ac0f7 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -1,15 +1,87 @@ -from typing import Any, List, Tuple +from typing import Any, Dict, List, Tuple +import gym.spaces as spaces import torch +import torch.nn as nn + +from habitat.tasks.rearrange.multi_task.pddl_domain import PddlProblem class HighLevelPolicy: + def __init__( + self, + config, + pddl_problem: PddlProblem, + num_envs: int, + skill_name_to_idx: Dict[int, str], + observation_space: spaces.Space, + action_space: spaces.Space, + ): + self._config = config + self._pddl_prob = pddl_problem + self._num_envs = num_envs + self._skill_name_to_idx = skill_name_to_idx + self._obs_space = observation_space + self._device = None + + def to(self, device): + self._device = device + + @property + def num_recurrent_layers(self): + return 0 + + def parameters(self): + return nn.Parameter(torch.zeros((1,), device=self._device)) + def get_next_skill( - self, observations, rnn_hidden_states, prev_actions, masks, plan_masks - ) -> Tuple[torch.Tensor, List[Any], torch.BoolTensor]: + self, + observations, + rnn_hidden_states: torch.Tensor, + prev_actions: torch.Tensor, + masks: torch.Tensor, + plan_masks: torch.Tensor, + deterministic: bool, + log_info: List[Dict[str, Any]], + ) -> Tuple[torch.Tensor, List[Any], torch.BoolTensor, Dict[str, Any]]: """ - :returns: A tuple containing the next skill index, a list of arguments - for the skill, and if the high-level policy requests immediate - termination. + Get the next skill to be executed. + + Args: + observations: Current observations. + rnn_hidden_states: Current hidden states of the RNN. + prev_actions: Previous actions taken. + masks: Binary masks indicating which environment(s) are active. + plan_masks: Binary masks indicating which environment(s) should + plan the next skill. + + Returns: + A tuple containing: + - next_skill: Next skill to be executed. + - skill_args_data: Arguments for the next skill. + - immediate_end: Binary masks indicating which environment(s) should + end immediately. + - Information for PolicyAction """ raise NotImplementedError() + + def apply_mask(self, mask: torch.Tensor) -> None: + pass + + def get_termination( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + cur_skills, + log_info, + ) -> torch.BoolTensor: + """ + Can force the currently executing skill to terminate. + + Returns: A binary tensor where 1 indicates the current skill should + terminate and 0 indicates the skill can continue. 
+ """ + + return torch.zeros(self._num_envs, dtype=torch.bool) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py new file mode 100644 index 0000000000..c855e4fcc5 --- /dev/null +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py @@ -0,0 +1,175 @@ +from itertools import chain +from typing import Any, List + +import gym.spaces as spaces +import numpy as np +import torch +import torch.nn as nn + +from habitat.tasks.rearrange.multi_task.pddl_action import PddlAction +from habitat_baselines.rl.ddppo.policy import resnet +from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder +from habitat_baselines.rl.hrl.hl.high_level_policy import HighLevelPolicy +from habitat_baselines.rl.models.rnn_state_encoder import ( + build_rnn_state_encoder, +) +from habitat_baselines.rl.ppo.policy import CriticHead +from habitat_baselines.utils.common import CategoricalNet + + +class NeuralHighLevelPolicy(HighLevelPolicy): + """ + A trained high-level policy that selects low-level skills and their skill + inputs. Is limited to discrete skills and discrete skill inputs. The policy + detects the available skills and their possible arguments via the PDDL + problem. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._all_actions = self._setup_actions() + self._n_actions = len(self._all_actions) + + use_obs_space = spaces.Dict( + { + k: self._obs_space.spaces[k] + for k in self._config.policy_input_keys + } + ) + self._im_obs_space = spaces.Dict( + {k: v for k, v in use_obs_space.items() if len(v.shape) == 3} + ) + + state_obs_space = { + k: v for k, v in use_obs_space.items() if len(v.shape) == 1 + } + self._state_obs_space = spaces.Dict(state_obs_space) + + rnn_input_size = sum( + v.shape[0] for v in self._state_obs_space.values() + ) + self._hidden_size = self._config.hidden_dim + if len(self._im_obs_space) > 0 and self._config.backbone != "NONE": + resnet_baseplanes = 32 + self._visual_encoder = ResNetEncoder( + self._im_obs_space, + baseplanes=resnet_baseplanes, + ngroups=resnet_baseplanes // 2, + make_backbone=getattr(resnet, self._config.backbone), + ) + self._visual_fc = nn.Sequential( + nn.Flatten(), + nn.Linear( + np.prod(self._visual_encoder.output_shape), + self._hidden_size, + ), + nn.ReLU(True), + ) + rnn_input_size += self._hidden_size + else: + self._visual_encoder = nn.Sequential() + self._visual_fc = nn.Sequential() + + self._state_encoder = build_rnn_state_encoder( + rnn_input_size, + self._hidden_size, + rnn_type=self._config.rnn_type, + num_layers=self._config.num_rnn_layers, + ) + self._policy = CategoricalNet(self._hidden_size, self._n_actions) + self._critic = CriticHead(self._hidden_size) + + def _setup_actions(self) -> List[PddlAction]: + all_actions = self._pddl_prob.get_possible_actions() + if not self._config.allow_other_place: + all_actions = [ + ac + for ac in all_actions + if ( + ac.name != "place" + or ac.param_values[0].name in ac.param_values[1].name + ) + ] + return all_actions + + @property + def num_recurrent_layers(self): + return self._state_encoder.num_recurrent_layers + + def parameters(self): + return chain( + self._visual_encoder.parameters(), + self._visual_fc.parameters(), + self._policy.parameters(), + self._state_encoder.parameters(), + self._critic.parameters(), + ) + + def forward(self, obs, rnn_hidden_states, masks): + hidden = [] + if len(self._im_obs_space) > 0: + im_obs = {k: obs[k] for k in 
self._im_obs_space.keys()} + visual_features = self._visual_encoder(im_obs) + visual_features = self._visual_fc(visual_features) + hidden.append(visual_features) + + if len(self._state_obs_space) > 0: + hidden.extend([obs[k] for k in self._state_obs_space.keys()]) + hidden = torch.cat(hidden, -1) + + return self._state_encoder(hidden, rnn_hidden_states, masks) + + def to(self, device): + self._device = device + return super().to(device) + + def get_value(self, observations, rnn_hidden_states, prev_actions, masks): + state, _ = self.forward(observations, rnn_hidden_states, masks) + return self._critic(state) + + def get_next_skill( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + plan_masks, + deterministic, + log_info, + ): + next_skill = torch.zeros(self._num_envs, dtype=torch.long) + skill_args_data: List[Any] = [None for _ in range(self._num_envs)] + immediate_end = torch.zeros(self._num_envs, dtype=torch.bool) + + state, rnn_hidden_states = self.forward( + observations, rnn_hidden_states, masks + ) + distrib = self._policy(state) + values = self._critic(state) + if deterministic: + skill_sel = distrib.mode() + else: + skill_sel = distrib.sample() + action_log_probs = distrib.log_probs(skill_sel) + + for batch_idx, should_plan in enumerate(plan_masks): + if should_plan != 1.0: + continue + use_ac = self._all_actions[skill_sel[batch_idx]] + next_skill[batch_idx] = self._skill_name_to_idx[use_ac.name] + skill_args_data[batch_idx] = [ + entity.name for entity in use_ac.param_values + ] + log_info[batch_idx]["nn_action"] = use_ac.compact_str + + return ( + next_skill, + skill_args_data, + immediate_end, + { + "action_log_probs": action_log_probs, + "values": values, + "actions": skill_sel, + "rnn_hidden_states": rnn_hidden_states, + }, + ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/__init__.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/__init__.py index 1b1536f999..1187ac3cd9 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/__init__.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/__init__.py @@ -5,6 +5,7 @@ from habitat_baselines.rl.hrl.skills.art_obj import ArtObjSkillPolicy from habitat_baselines.rl.hrl.skills.nav import NavSkillPolicy from habitat_baselines.rl.hrl.skills.nn_skill import NnSkillPolicy +from habitat_baselines.rl.hrl.skills.noop import NoopSkillPolicy from habitat_baselines.rl.hrl.skills.oracle_nav import OracleNavPolicy from habitat_baselines.rl.hrl.skills.pick import PickSkillPolicy from habitat_baselines.rl.hrl.skills.place import PlaceSkillPolicy @@ -22,4 +23,5 @@ "ResetArmSkill", "SkillPolicy", "WaitSkillPolicy", + "NoopSkillPolicy", ] diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py index be9ebb2258..ed153443f6 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py @@ -14,6 +14,7 @@ from habitat_baselines.common.tensor_dict import TensorDict from habitat_baselines.config.default import get_config from habitat_baselines.rl.hrl.skills.skill import SkillPolicy +from habitat_baselines.rl.ppo.policy import PolicyAction from habitat_baselines.utils.common import get_num_actions @@ -118,7 +119,7 @@ def _internal_act( masks, cur_batch_idx, deterministic=False, - ): + ) -> PolicyAction: filtered_obs = self._get_filtered_obs(observations, cur_batch_idx) filtered_prev_actions = prev_actions[ @@ -126,7 +127,7 @@ def 
_internal_act( ] filtered_obs = self._select_obs(filtered_obs, cur_batch_idx) - _, action, _, rnn_hidden_states = self._wrap_policy.act( + action_data = self._wrap_policy.act( filtered_obs, rnn_hidden_states, filtered_prev_actions, @@ -134,11 +135,15 @@ def _internal_act( deterministic, ) full_action = torch.zeros(prev_actions.shape, device=masks.device) - full_action[:, self._ac_start : self._ac_start + self._ac_len] = action + full_action[ + :, self._ac_start : self._ac_start + self._ac_len + ] = action_data.actions + action_data.actions = full_action + self._did_want_done[cur_batch_idx] = full_action[ cur_batch_idx, self._stop_action_idx ] - return full_action, rnn_hidden_states + return action_data @classmethod def from_config( diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py new file mode 100644 index 0000000000..6456344cb9 --- /dev/null +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py @@ -0,0 +1,39 @@ +from typing import Any + +import gym.spaces as spaces +import torch + +from habitat_baselines.rl.hrl.skills.skill import SkillPolicy +from habitat_baselines.rl.ppo.policy import PolicyAction + + +class NoopSkillPolicy(SkillPolicy): + def __init__( + self, + config, + action_space: spaces.Space, + batch_size, + ): + super().__init__(config, action_space, batch_size, False) + + def _parse_skill_arg(self, *args, **kwargs) -> Any: + pass + + def _is_skill_done( + self, observations, rnn_hidden_states, prev_actions, masks, batch_idx + ) -> torch.BoolTensor: + return torch.zeros(masks.size(0), dtype=torch.bool) + + def _internal_act( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + cur_batch_idx, + full_action, + deterministic=False, + ): + return PolicyAction( + actions=full_action, rnn_hidden_states=rnn_hidden_states + ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index 7ba03a0bd7..e58a68daad 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -17,6 +17,7 @@ from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.hrl.skills.nn_skill import NnSkillPolicy from habitat_baselines.rl.hrl.utils import find_action_range +from habitat_baselines.rl.ppo.policy import PolicyAction class OracleNavPolicy(NnSkillPolicy): @@ -87,7 +88,7 @@ def from_config( cls, config, observation_space, action_space, batch_size, full_config ): filtered_action_space = ActionSpace( - {config.NAV_ACTION_NAME: action_space[config.NAV_ACTION_NAME]} + {config.nav_action_name: action_space[config.nav_action_name]} ) baselines_logger.debug( f"Loaded action space {filtered_action_space} for skill {config.skill_name}" @@ -184,4 +185,6 @@ def _internal_act( full_action[:, self._oracle_nav_ac_idx] = action_idxs - return full_action, rnn_hidden_states + return PolicyAction( + actions=full_action, rnn_hidden_states=rnn_hidden_states + ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py index 3f88f1b1c4..4efb63d415 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py @@ -9,6 +9,7 @@ RelativeRestingPositionSensor, ) from habitat_baselines.rl.hrl.skills.nn_skill import NnSkillPolicy +from habitat_baselines.rl.ppo.policy import 
PolicyAction class PickSkillPolicy(NnSkillPolicy): @@ -33,12 +34,12 @@ def _parse_skill_arg(self, skill_arg): self._internal_log(f"Parsing skill argument {skill_arg}") return int(skill_arg[0].split("|")[1]) - def _mask_pick(self, action, observations): + def _mask_pick(self, action: PolicyAction, observations) -> PolicyAction: # Mask out the release if the object is already held. is_holding = observations[IsHoldingSensor.cls_uuid].view(-1) for i in torch.nonzero(is_holding): # Do not release the object once it is held - action[i, self._grip_ac_idx] = 1.0 + action.actions[i, self._grip_ac_idx] = 1.0 return action def _internal_act( @@ -50,7 +51,7 @@ def _internal_act( cur_batch_idx, deterministic=False, ): - action, hxs = super()._internal_act( + action = super()._internal_act( observations, rnn_hidden_states, prev_actions, @@ -59,4 +60,4 @@ def _internal_act( deterministic, ) action = self._mask_pick(action, observations) - return action, hxs + return action diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py index 9e33f713c0..7d17aead32 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py @@ -9,6 +9,7 @@ import torch from habitat_baselines.rl.hrl.skills.skill import SkillPolicy +from habitat_baselines.rl.ppo.policy import PolicyAction from habitat_baselines.utils.common import get_num_actions @@ -93,4 +94,6 @@ def _internal_act( delta ).to(device=action.device, dtype=action.dtype) - return action, rnn_hidden_states + return PolicyAction( + actions=action, rnn_hidden_states=rnn_hidden_states + ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 7b857288c5..b0fd2ddb42 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -10,7 +10,7 @@ from habitat.tasks.rearrange.rearrange_sensors import IsHoldingSensor from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.hrl.utils import find_action_range -from habitat_baselines.rl.ppo.policy import Policy +from habitat_baselines.rl.ppo.policy import Policy, PolicyAction from habitat_baselines.utils.common import get_num_actions @@ -38,6 +38,14 @@ def __init__( None for _ in range(self._batch_size) ] + if "pddl_apply_action" in action_space: + self._pddl_ac_start, _ = find_action_range( + action_space, "pddl_apply_action" + ) + else: + self._pddl_ac_start = None + self._delay_term = [None for _ in range(self._batch_size)] + self._grip_ac_idx = 0 found_grip = False for k, space in action_space.items(): @@ -68,8 +76,8 @@ def _get_multi_sensor_index(self, batch_idx: List[int]) -> List[int]: return [self._cur_skill_args[i] for i in batch_idx] def _keep_holding_state( - self, full_action: torch.Tensor, observations - ) -> torch.Tensor: + self, action_data: PolicyAction, observations + ) -> PolicyAction: """ Makes the action so it does not result in dropping or picking up an object. Used in navigation and other skills which are not supposed to @@ -79,8 +87,51 @@ def _keep_holding_state( is_holding = observations[IsHoldingSensor.cls_uuid].view(-1) # If it is not holding (0) want to keep releasing -> output -1. # If it is holding (1) want to keep grasping -> output +1. 
- full_action[:, self._grip_ac_idx] = is_holding + (is_holding - 1.0) - return full_action + action_data.write_action( + self._grip_ac_idx, is_holding + (is_holding - 1.0) + ) + return action_data + + def _apply_postcond( + self, + actions, + log_info, + skill_name, + env_i, + idx, + ): + skill_args = self._raw_skill_args[env_i] + action = self._pddl_problem.actions[skill_name] + + entities = [self._pddl_problem.get_entity(x) for x in skill_args] + + ac_idx = self._pddl_ac_start + found = False + for other_action in self._action_ordering: + if other_action.name != action.name: + ac_idx += other_action.n_args + else: + found = True + break + if not found: + raise ValueError(f"Could not find action {action}") + + entity_idxs = [ + self._entities_list.index(entity) + 1 for entity in entities + ] + if len(entity_idxs) != action.n_args: + raise ValueError( + f"Inconsistent # of args {action.n_args} versus {entity_idxs} for {action} with {skill_args} and {entities}" + ) + + actions[idx, ac_idx : ac_idx + action.n_args] = torch.tensor( + entity_idxs, dtype=actions.dtype, device=actions.device + ) + apply_action = action.clone() + apply_action.set_param_values(entities) + + log_info[env_i]["pddl_action"] = apply_action.compact_str + return actions def should_terminate( self, @@ -88,7 +139,10 @@ def should_terminate( rnn_hidden_states, prev_actions, masks, - batch_idx, + hl_says_term, + batch_idx: List[int], + skill_name: List[str], + log_info, ) -> Tuple[torch.BoolTensor, torch.BoolTensor]: """ :returns: A (batch_size,) size tensor where 1 indicates the skill wants to end and 0 if not. @@ -114,6 +168,22 @@ def should_terminate( else: is_skill_done = is_skill_done | over_max_len + # for i, env_i in enumerate(batch_idx): + # if self._delay_term[env_i]: + # self._delay_term[env_i] = False + # is_skill_done[i] = 1.0 + # elif ( + # self._config.apply_postconds + # and is_skill_done[i] == 1.0 + # and hl_says_term[i] == 0.0 + # ): + # actions = self._apply_postcond( + # actions, log_info, skill_name[i], env_i, i + # ) + # self._delay_term[env_i] = True + # is_skill_done[i] = 0.0 + is_skill_done |= hl_says_term + if bad_terminate.sum() > 0: self._internal_log( f"Bad terminating due to timeout {self._cur_skill_step}, {bad_terminate}", @@ -147,6 +217,11 @@ def on_enter( prev_actions[batch_idxs] * 0.0, ) + def set_pddl_problem(self, pddl_prob): + self._pddl_problem = pddl_prob + self._entities_list = self._pddl_problem.get_ordered_entities_list() + self._action_ordering = self._pddl_problem.get_ordered_actions() + @classmethod def from_config( cls, config, observation_space, action_space, batch_size, full_config @@ -166,7 +241,7 @@ def act( :returns: Predicted action and next rnn hidden state. 
""" self._cur_skill_step[cur_batch_idx] += 1 - action, hxs = self._internal_act( + action_data = self._internal_act( observations, rnn_hidden_states, prev_actions, @@ -176,8 +251,8 @@ def act( ) if self._should_keep_hold_state: - action = self._keep_holding_state(action, observations) - return action, hxs + action_data = self._keep_holding_state(action_data, observations) + return action_data def to(self, device): self._cur_skill_step = self._cur_skill_step.to(device) @@ -196,9 +271,7 @@ def _select_obs(self, obs, cur_batch_idx): ) entity_positions = obs[k].view( - len(cur_batch_idx), - -1, - self._config.get("obs_skill_input_dim", 3), + len(cur_batch_idx), -1, self._config.obs_skill_input_dim ) obs[k] = entity_positions[ torch.arange(len(cur_batch_idx)), cur_multi_sensor_index @@ -229,5 +302,5 @@ def _internal_act( masks, cur_batch_idx, deterministic=False, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> PolicyAction: raise NotImplementedError() diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py index 6fb9a1b552..e8a7ca03e1 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py @@ -8,6 +8,7 @@ import torch from habitat_baselines.rl.hrl.skills.skill import SkillPolicy +from habitat_baselines.rl.ppo.policy import PolicyAction class WaitSkillPolicy(SkillPolicy): @@ -40,4 +41,6 @@ def _internal_act( deterministic=False, ): action = torch.zeros(prev_actions.shape, device=prev_actions.device) - return action, rnn_hidden_states + return PolicyAction( + actions=action, rnn_hidden_states=rnn_hidden_states + ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/utils.py b/habitat-baselines/habitat_baselines/rl/hrl/utils.py index 7cebe7db67..0c22f3b183 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/utils.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/utils.py @@ -24,5 +24,5 @@ def find_action_range( break start_idx += get_num_actions(action_space[k]) if not found: - raise ValueError(f"Could not find stop action in {action_space}") + return None return start_idx, end_idx diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index 71a2dc170b..bfb8ac6f4a 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -4,7 +4,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import abc -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union import torch from gym import spaces @@ -30,6 +31,26 @@ from omegaconf import DictConfig +@dataclass +class PolicyAction: + rnn_hidden_states: torch.Tensor + actions: torch.Tensor + values: Optional[torch.Tensor] = None + action_log_probs: Optional[torch.Tensor] = None + take_actions: Optional[torch.Tensor] = None + policy_info: Optional[List[Dict[str, Any]]] = None + + def write_action(self, write_idx, write_action): + self.actions[:, write_idx] = write_action + + @property + def env_actions(self) -> torch.Tensor: + if self.take_actions is None: + return self.actions + else: + return self.take_actions + + class Policy(abc.ABC): action_distribution: nn.Module @@ -47,7 +68,9 @@ def num_recurrent_layers(self) -> int: def forward(self, *x): raise NotImplementedError - def get_policy_info(self, infos, dones) -> List[Dict[str, float]]: + def extract_policy_info( + self, action_data: PolicyAction, infos, dones + ) -> List[Dict[str, float]]: """ Gets the log information from the policy at the current time step. Currently only called during evaluation. The return list should be @@ -64,7 +87,7 @@ def act( prev_actions, masks, deterministic=False, - ): + ) -> PolicyAction: raise NotImplementedError @classmethod @@ -155,8 +178,12 @@ def act( action = distribution.sample() action_log_probs = distribution.log_probs(action) - - return value, action, action_log_probs, rnn_hidden_states + return PolicyAction( + values=value, + actions=action, + action_log_probs=action_log_probs, + rnn_hidden_states=rnn_hidden_states, + ) def get_value(self, observations, rnn_hidden_states, prev_actions, masks): features, _, _ = self.net( diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index 8f999efea6..42bb765815 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -469,12 +469,7 @@ def _compute_actions_and_step_envs(self, buffer_index: int = 0): ] profiling_wrapper.range_push("compute actions") - ( - values, - actions, - actions_log_probs, - recurrent_hidden_states, - ) = self.actor_critic.act( + action_data = self.actor_critic.act( step_batch["observations"], step_batch["recurrent_hidden_states"], step_batch["prev_actions"], @@ -488,7 +483,8 @@ def _compute_actions_and_step_envs(self, buffer_index: int = 0): t_step_env = time.time() for index_env, act in zip( - range(env_slice.start, env_slice.stop), actions.cpu().unbind(0) + range(env_slice.start, env_slice.stop), + action_data.env_actions.cpu().unbind(0), ): if is_continuous_action_space(self.policy_action_space): # Clipping actions to the specified limits @@ -504,10 +500,10 @@ def _compute_actions_and_step_envs(self, buffer_index: int = 0): self.env_time += time.time() - t_step_env self.rollouts.insert( - next_recurrent_hidden_states=recurrent_hidden_states, - actions=actions, - action_log_probs=actions_log_probs, - value_preds=values, + next_recurrent_hidden_states=action_data.rnn_hidden_states, + actions=action_data.actions, + action_log_probs=action_data.action_log_probs, + value_preds=action_data.values, buffer_index=buffer_index, ) @@ -1052,20 +1048,16 @@ def _eval_checkpoint( current_episodes_info = self.envs.current_episodes() with inference_mode(): - ( - _, - actions, - _, -
test_recurrent_hidden_states, - ) = self.actor_critic.act( + action_data = self.actor_critic.act( batch, test_recurrent_hidden_states, prev_actions, not_done_masks, deterministic=False, ) + test_recurrent_hidden_states = action_data.rnn_hidden_states - prev_actions.copy_(actions) # type: ignore + prev_actions.copy_(action_data.actions) # type: ignore # NB: Move actions to CPU. If CUDA tensors are # sent in to env.step(), that will create CUDA contexts # in the subprocesses. @@ -1077,19 +1069,21 @@ def _eval_checkpoint( self.policy_action_space.low, self.policy_action_space.high, ) - for a in actions.cpu() + for a in action_data.env_actions.cpu() ] else: - step_data = [a.item() for a in actions.cpu()] + step_data = [a.item() for a in action_data.env_actions.cpu()] outputs = self.envs.step(step_data) observations, rewards_l, dones, infos = [ list(x) for x in zip(*outputs) ] - policy_info = self.actor_critic.get_policy_info(infos, dones) - for i in range(len(policy_info)): - infos[i].update(policy_info[i]) + policy_infos = self.actor_critic.extract_policy_info( + action_data, infos, dones + ) + for i in range(len(policy_infos)): + infos[i].update(policy_infos[i]) batch = batch_obs( # type: ignore observations, device=self.device, diff --git a/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py b/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py index bd674a00a9..56ef3f7025 100644 --- a/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py +++ b/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py @@ -323,6 +323,7 @@ def step(self) -> Tuple[bool, List[Tuple[int, int]]]: actions, actions_log_probs, next_recurrent_hidden_states, + policy_info, ) = self.actor_critic.act( obs, recurrent_hidden_states, diff --git a/habitat-lab/habitat/tasks/rearrange/multi_task/pddl_action.py b/habitat-lab/habitat/tasks/rearrange/multi_task/pddl_action.py index c1ba9d413d..00598f6d58 100644 --- a/habitat-lab/habitat/tasks/rearrange/multi_task/pddl_action.py +++ b/habitat-lab/habitat/tasks/rearrange/multi_task/pddl_action.py @@ -162,7 +162,7 @@ def apply(self, sim_info: PddlSimInfo) -> None: p.set_state(sim_info) @property - def param_values(self): + def param_values(self) -> Optional[List[PddlEntity]]: if self._param_values is None: raise ValueError( "Accessing action param values before they are set." diff --git a/test/test_baseline_training.py b/test/test_baseline_training.py index b129bbff75..eacd10759d 100644 --- a/test/test_baseline_training.py +++ b/test/test_baseline_training.py @@ -13,6 +13,7 @@ from habitat.config import read_write from habitat.config.default import get_agent_config +from habitat_baselines.run import execute_exp try: import torch @@ -148,6 +149,63 @@ def test_trainers(config_path, num_updates, overrides, trainer_name): # Training should complete without raising an error. 
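The refactor above replaces the loose `(values, actions, log_probs, hidden_states)` tuples with a single `PolicyAction` container that the trainer and eval loop read fields from. A minimal sketch (not part of the patch) of the `env_actions` fallback the rollout loop relies on:

```python
# Sketch only: exercising the PolicyAction container added in policy.py.
import torch

from habitat_baselines.rl.ppo.policy import PolicyAction

action_data = PolicyAction(
    rnn_hidden_states=torch.zeros(2, 1, 4),  # arbitrary toy shapes
    actions=torch.zeros(2, 8),
)
# With no separate take_actions tensor, env_actions falls back to actions,
# which is what the trainer sends to env.step().
assert action_data.env_actions is action_data.actions
```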
+@pytest.mark.parametrize( + "config_path,mode,trainer_name", + [ + ( + "habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml", + "eval", + "ppo", + ), + ( + "habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml", + "eval", + "ppo", + ), + ( + "habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml", + "eval", + "ppo", + ), + ], +) +def test_hrl(config_path, mode, trainer_name): + # Remove the checkpoints from previous tests + for f in glob.glob("data/test_checkpoints/test_training/*"): + os.remove(f) + + # Setup the training + config = get_config( + config_path, + [ + "habitat_baselines.num_updates=1", + "habitat_baselines.eval.split=minival", + "habitat_baselines.total_num_steps=-1.0", + "habitat_baselines.test_episode_count=1", + "habitat_baselines.checkpoint_folder=data/test_checkpoints/test_training", + f"habitat_baselines.trainer_name={trainer_name}", + ], + ) + with read_write(config): + config.habitat_baselines.eval.update({"video_option": []}) + for ( + skill_name, + skill, + ) in ( + config.habitat_baselines.rl.policy.hierarchical_policy.defined_skills.items() + ): + if skill.load_ckpt_file == "": + continue + skill.update( + { + "force_config_file": f"benchmark/rearrange={skill_name}", + "max_skill_steps": 1, + "load_ckpt_file": "", + } + ) + execute_exp(config, mode) + + @pytest.mark.skipif( int(os.environ.get("TEST_BASELINE_SMALL", 0)) == 0, reason="Full training tests did not run. Need `export TEST_BASELINE_SMALL=1", diff --git a/test/test_rearrange_task.py b/test/test_rearrange_task.py index ed2fed6306..99d012871c 100644 --- a/test/test_rearrange_task.py +++ b/test/test_rearrange_task.py @@ -4,16 +4,12 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import gc -import itertools import json -import os import os.path as osp import time from glob import glob import pytest -import torch import yaml from omegaconf import DictConfig, OmegaConf @@ -29,8 +25,6 @@ from habitat.datasets.rearrange.rearrange_dataset import RearrangeDatasetV0 from habitat.tasks.rearrange.multi_task.composite_task import CompositeTask from habitat_baselines.config.default import get_config as baselines_get_config -from habitat_baselines.rl.ddppo.ddp_utils import find_free_port -from habitat_baselines.run import run_exp CFG_TEST = "benchmark/rearrange/pick.yaml" GEN_TEST_CFG = ( @@ -109,33 +103,6 @@ def test_rearrange_baseline_envs(test_cfg_path): ) -@pytest.mark.parametrize( - "test_cfg_path", - list( - glob("habitat-lab/habitat/config/benchmark/rearrange/*"), - ), -) -def test_rearrange_tasks(test_cfg_path): - """ - Test the underlying Habitat Tasks - """ - if not osp.isfile(test_cfg_path): - return - - config = get_config(test_cfg_path) - if ( - config.habitat.dataset.data_path - == "data/ep_datasets/bench_scene.json.gz" - ): - pytest.skip( - "This config is only useful for examples and does not have the generated dataset" - ) - - with habitat.Env(config=config) as env: - for _ in range(5): - env.reset() - - @pytest.mark.parametrize( "test_cfg_path", list( @@ -209,32 +176,3 @@ def test_rearrange_episode_generator( logger.info( f"successful_ep = {len(dataset.episodes)} generated in {time.time()-start_time} seconds." 
) - - -@pytest.mark.parametrize( - "test_cfg_path,mode", - list( - itertools.product( - glob("habitat-baselines/habitat_baselines/config/tp_srl_test/*"), - ["eval"], - ) - ), -) -def test_tp_srl(test_cfg_path, mode): - # For testing with world_size=1 - os.environ["MAIN_PORT"] = str(find_free_port()) - - run_exp( - test_cfg_path.replace( - "habitat-baselines/habitat_baselines/config/", "" - ), - mode, - ["habitat_baselines.eval.split=train"], - ) - - # Needed to destroy the trainer - gc.collect() - - # Deinit processes group - if torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() From f6d1f110ad192cc500defbfd91a15f7d40cf6f77 Mon Sep 17 00:00:00 2001 From: ASzot Date: Mon, 26 Dec 2022 11:58:06 +0100 Subject: [PATCH 02/48] Working on HRL trainer --- .../common/baseline_registry.py | 16 ++ .../common/rollout_storage.py | 3 + .../config/default_structured_configs.py | 3 + .../config/rearrange/rl_hl_srl_onav.yaml | 45 ++-- .../config/rearrange/tp_srl_oracle_nav.yaml | 27 +- .../habitat_baselines/rl/ddppo/algo/ddppo.py | 2 + .../habitat_baselines/rl/hrl/__init__.py | 8 + .../rl/hrl/hierarchical_policy.py | 15 +- .../rl/hrl/hl/high_level_policy.py | 5 +- .../habitat_baselines/rl/hrl/hrl_ppo.py | 128 +++++++++ .../rl/hrl/hrl_rollout_storage.py | 159 +++++++++++ .../rl/hrl/skills/oracle_nav.py | 2 +- .../habitat_baselines/rl/hrl/skills/skill.py | 51 ++-- .../habitat_baselines/rl/ppo/policy.py | 1 + .../habitat_baselines/rl/ppo/ppo.py | 255 +++++++++--------- .../habitat_baselines/rl/ppo/ppo_trainer.py | 27 +- .../config/default_structured_configs.py | 22 +- .../habitat/task/rearrange/rearrange.yaml | 1 + .../task/rearrange/rearrange_easy.yaml | 1 + .../tasks/rearrange/actions/pddl_actions.py | 32 ++- .../tasks/rearrange/sub_tasks/pick_task.py | 2 +- test/test_baseline_training.py | 28 +- 22 files changed, 613 insertions(+), 220 deletions(-) create mode 100644 habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py create mode 100644 habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py diff --git a/habitat-baselines/habitat_baselines/common/baseline_registry.py b/habitat-baselines/habitat_baselines/common/baseline_registry.py index 5038cf91a5..7ff89a28b6 100644 --- a/habitat-baselines/habitat_baselines/common/baseline_registry.py +++ b/habitat-baselines/habitat_baselines/common/baseline_registry.py @@ -136,5 +136,21 @@ def register_auxiliary_loss( def get_auxiliary_loss(cls, name: str): return cls._get_impl("aux_loss", name) + @classmethod + def register_storage(cls, to_register=None, *, name: Optional[str] = None): + return cls._register_impl("storage", to_register, name) + + @classmethod + def get_storage(cls, name: str): + return cls._get_impl("storage", name) + + @classmethod + def register_updater(cls, to_register=None, *, name: Optional[str] = None): + return cls._register_impl("updater", to_register, name) + + @classmethod + def get_updater(cls, name: str): + return cls._get_impl("updater", name) + baseline_registry = BaselineRegistry() diff --git a/habitat-baselines/habitat_baselines/common/rollout_storage.py b/habitat-baselines/habitat_baselines/common/rollout_storage.py index b9b2984820..8bc0b3b9f2 100644 --- a/habitat-baselines/habitat_baselines/common/rollout_storage.py +++ b/habitat-baselines/habitat_baselines/common/rollout_storage.py @@ -10,6 +10,7 @@ import numpy as np import torch +from habitat_baselines.common.baseline_registry import baseline_registry from habitat_baselines.common.tensor_dict import DictTree, TensorDict from 
habitat_baselines.rl.models.rnn_state_encoder import ( build_pack_info_from_dones, @@ -17,6 +18,7 @@ ) +@baseline_registry.register_storage class RolloutStorage: r"""Class for storing rollout information for RL trainers.""" @@ -116,6 +118,7 @@ def insert( rewards=None, next_masks=None, buffer_index: int = 0, + **kwargs, ): if not self.is_double_buffered: assert buffer_index == 0 diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index 56a51d6c82..699df3053e 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -404,6 +404,8 @@ class HabitatBaselinesConfig(HabitatBaselinesBaseConfig): # ) # cmd_trailing_opts: List[str] = field(default_factory=list) trainer_name: str = "ppo" + updater_name: str = "PPO" + distrib_updater_name: str = "DDPPO" torch_gpu_id: int = 0 video_render_views: List[str] = field(default_factory=list) tensorboard_dir: str = "tb" @@ -415,6 +417,7 @@ class HabitatBaselinesConfig(HabitatBaselinesBaseConfig): eval_ckpt_path_dir: str = "data/checkpoints" num_environments: int = 16 num_processes: int = -1 # deprecated + rollout_storage: str = "RolloutStorage" checkpoint_folder: str = "data/checkpoints" num_updates: int = 10000 num_checkpoints: int = 10 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml index 3f542bbf09..1675eadb6c 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml @@ -8,24 +8,43 @@ defaults: - /habitat_baselines/rl/policy/obs_transforms: - add_virtual_keys_base - /habitat/task/actions: + - pddl_apply_action - oracle_nav_action - arm_action - base_velocity - rearrange_stop - _self_ +habitat: + gym: + auto_name: RearrangeEasy + obs_keys: + - robot_head_depth + - relative_resting_position + - obj_start_sensor + - obj_goal_sensor + - obj_start_gps_compass + - obj_goal_gps_compass + - joint + - is_holding + - localization_sensor + + habitat_baselines: verbose: False trainer_name: "ddppo" torch_gpu_id: 0 tensorboard_dir: "tb" + rollout_storage: "HrlRolloutStorage" + updater_name: "HrlPPO" + distrib_updater_name: "HrlDDPPO" video_dir: "video_dir" video_fps: 30 video_render_views: - "third_rgb_sensor" test_episode_count: -1 eval_ckpt_path_dir: "" - num_environments: 1 + num_environments: 3 writer_type: 'tb' checkpoint_folder: "data/new_checkpoints" num_updates: -1 @@ -62,24 +81,19 @@ habitat_baselines: - "robot_head_depth" defined_skills: open_cab: - skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - load_ckpt_file: "data/models/open_cab.pth" - max_skill_steps: 200 + skill_name: "NoopSkillPolicy" + max_skill_steps: 1 apply_postconds: True open_fridge: - skill_name: "ArtObjSkillPolicy" - load_ckpt_file: "data/models/open_fridge.pth" - max_skill_steps: 200 + skill_name: "NoopSkillPolicy" + max_skill_steps: 1 apply_postconds: True close_cab: skill_name: "NoopSkillPolicy" obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_cab.pth" - max_skill_steps: 200 + max_skill_steps: 1 close_fridge: skill_name: "NoopSkillPolicy" @@ -116,11 +130,10 @@ habitat_baselines: force_end_on_timeout: False use_skills: - # Uncomment if you are also using these skills - # open_cab: 
"open_cab" - # open_fridge: "open_fridge" - # close_cab: "open_cab" - # close_fridge: "open_fridge" + open_cab: "open_cab" + open_fridge: "open_fridge" + close_cab: "close_cab" + close_fridge: "close_fridge" pick: "pick" place: "place" nav: "nav_to_obj" diff --git a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml index 5e8b59a944..f456242704 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml @@ -12,20 +12,19 @@ defaults: - rearrange_stop - _self_ -AGENT_0_ORACLE_NAV_ACTION: - TYPE: "OracleNavAction" - TURN_VELOCITY: 1.0 - FORWARD_VELOCITY: 1.0 - TURN_THRESH: 0.1 - DIST_THRESH: 0.2 - AGENT: 0 - LIN_SPEED: 40.0 - ANG_SPEED: 20.0 - MIN_ABS_LIN_SPEED: 1.0 - MIN_ABS_ANG_SPEED: 1.0 - ALLOW_DYN_SLIDE: False - END_ON_STOP: False - ALLOW_BACK: True +habitat: + gym: + auto_name: RearrangeEasy + obs_keys: + - robot_head_depth + - relative_resting_position + - obj_start_sensor + - obj_goal_sensor + - obj_start_gps_compass + - obj_goal_gps_compass + - joint + - is_holding + - localization_sensor habitat_baselines: verbose: False diff --git a/habitat-baselines/habitat_baselines/rl/ddppo/algo/ddppo.py b/habitat-baselines/habitat_baselines/rl/ddppo/algo/ddppo.py index c7bacb6cd1..1e95c02b4b 100644 --- a/habitat-baselines/habitat_baselines/rl/ddppo/algo/ddppo.py +++ b/habitat-baselines/habitat_baselines/rl/ddppo/algo/ddppo.py @@ -10,6 +10,7 @@ import torch from torch import distributed as distrib +from habitat_baselines.common.baseline_registry import baseline_registry from habitat_baselines.rl.ppo import PPO @@ -130,5 +131,6 @@ def _evaluate_actions(self, *args, **kwargs): ) +@baseline_registry.register_updater class DDPPO(DecentralizedDistributedMixin, PPO): pass diff --git a/habitat-baselines/habitat_baselines/rl/hrl/__init__.py b/habitat-baselines/habitat_baselines/rl/hrl/__init__.py index 0f0db8cad5..75f7fadf5c 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/__init__.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/__init__.py @@ -3,3 +3,11 @@ # Copyright (c) Meta Platforms, Inc. and its affiliates. # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ +from habitat_baselines.rl.hrl.hrl_ppo import HrlPPO +from habitat_baselines.rl.hrl.hrl_rollout_storage import HrlRolloutStorage + +__all__ = [ + "HrlPPO", + "HrlRolloutStorage", +] diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index f9217051e4..b8f63ad9eb 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -8,6 +8,7 @@ import gym.spaces as spaces import torch +import torch.nn as nn from habitat.core.spaces import ActionSpace from habitat.tasks.rearrange.multi_task.composite_sensors import ( @@ -38,7 +39,7 @@ @baseline_registry.register_policy -class HierarchicalPolicy(Policy): +class HierarchicalPolicy(nn.Module, Policy): def __init__( self, config, @@ -212,6 +213,11 @@ def act( self._cur_skills, log_info, ) + # Compute the actions from the current skills + actions = torch.zeros( + (self._num_envs, get_num_actions(self._action_space)), + device=masks.device, + ) grouped_skills = self._broadcast_skill_ids( self._cur_skills, @@ -220,6 +226,7 @@ def act( "rnn_hidden_states": rnn_hidden_states, "prev_actions": prev_actions, "masks": masks, + "actions": actions, "hl_says_term": hl_says_term, }, # Only decide on skill termination if the episode is active. @@ -289,12 +296,6 @@ def act( (~self._call_high_level) * self._cur_skills ) + (self._call_high_level * new_skills) - # Compute the actions from the current skills - actions = torch.zeros( - (self._num_envs, get_num_actions(self._action_space)), - device=masks.device, - ) - grouped_skills = self._broadcast_skill_ids( self._cur_skills, sel_dat={ diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index 27bb1ac0f7..5bbc2726ec 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -7,7 +7,7 @@ from habitat.tasks.rearrange.multi_task.pddl_domain import PddlProblem -class HighLevelPolicy: +class HighLevelPolicy(nn.Module): def __init__( self, config, @@ -17,6 +17,7 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Space, ): + super().__init__() self._config = config self._pddl_prob = pddl_problem self._num_envs = num_envs @@ -32,7 +33,7 @@ def num_recurrent_layers(self): return 0 def parameters(self): - return nn.Parameter(torch.zeros((1,), device=self._device)) + return iter([nn.Parameter(torch.zeros((1,), device=self._device))]) def get_next_skill( self, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py new file mode 100644 index 0000000000..706f2b9a57 --- /dev/null +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py @@ -0,0 +1,128 @@ +import torch +import torch.nn.functional as F + +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.rl.ddppo.algo.ddppo import DecentralizedDistributedMixin +from habitat_baselines.rl.ppo import PPO +from habitat_baselines.utils.common import ( + LagrangeInequalityCoefficient, + inference_mode, +) + + +@baseline_registry.register_updater +class HrlPPO(PPO): + def _update_from_batch(self, batch, epoch, rollouts, learner_metrics): + n_samples = max(batch["loss_mask"].sum(), 1) + + def record_min_mean_max(t: torch.Tensor, prefix: str): + for name, op in ( + ("min", torch.min), + 
("mean", torch.mean), + ("max", torch.max), + ): + learner_metrics[f"{prefix}_{name}"].append(op(t)) + + def reduce_loss(loss): + return (loss * batch["loss_mask"]).sum() / n_samples + + self._set_grads_to_none() + + ( + values, + action_log_probs, + dist_entropy, + _, + aux_loss_res, + ) = self._evaluate_actions( + batch["observations"], + batch["recurrent_hidden_states"], + batch["prev_actions"], + batch["masks"], + batch["actions"], + batch["rnn_build_seq_info"], + ) + + ratio = torch.exp(action_log_probs - batch["action_log_probs"]) + + surr1 = batch["advantages"] * ratio + surr2 = batch["advantages"] * ( + torch.clamp( + ratio, + 1.0 - self.clip_param, + 1.0 + self.clip_param, + ) + ) + action_loss = -torch.min(surr1, surr2) + action_loss = reduce_loss(action_loss) + + values = values.float() + orig_values = values + + if self.use_clipped_value_loss: + delta = values.detach() - batch["value_preds"] + value_pred_clipped = batch["value_preds"] + delta.clamp( + -self.clip_param, self.clip_param + ) + + values = torch.where( + delta.abs() < self.clip_param, + values, + value_pred_clipped, + ) + + value_loss = 0.5 * F.mse_loss( + values, batch["returns"], reduction="none" + ) + value_loss = reduce_loss(value_loss) + + all_losses = [ + self.value_loss_coef * value_loss, + action_loss, + ] + + dist_entropy = reduce_loss(dist_entropy) + if isinstance(self.entropy_coef, float): + all_losses.append(-self.entropy_coef * dist_entropy) + else: + all_losses.append(self.entropy_coef.lagrangian_loss(dist_entropy)) + + all_losses.extend(v["loss"] for v in aux_loss_res.values()) + + total_loss = torch.stack(all_losses).sum() + + total_loss = self.before_backward(total_loss) + total_loss.backward() + self.after_backward(total_loss) + + grad_norm = self.before_step() + self.optimizer.step() + self.after_step() + + with inference_mode(): + record_min_mean_max(orig_values, "value_pred") + record_min_mean_max(ratio, "prob_ratio") + + learner_metrics["value_loss"].append(value_loss) + learner_metrics["action_loss"].append(action_loss) + learner_metrics["dist_entropy"].append(dist_entropy) + if epoch == (self.ppo_epoch - 1): + learner_metrics["ppo_fraction_clipped"].append( + (ratio > (1.0 + self.clip_param)).float().mean() + + (ratio < (1.0 - self.clip_param)).float().mean() + ) + + learner_metrics["grad_norm"].append(grad_norm) + if isinstance(self.entropy_coef, LagrangeInequalityCoefficient): + learner_metrics["entropy_coef"].append( + self.entropy_coef().detach() + ) + + for name, res in aux_loss_res.items(): + for k, v in res.items(): + learner_metrics[f"aux_{name}_{k}"].append(v.detach()) + + +@baseline_registry.register_updater +class HrlDDPPO(DecentralizedDistributedMixin, HrlPPO): + pass diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py new file mode 100644 index 0000000000..b7750c5493 --- /dev/null +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Iterator + +import torch + +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.common.rollout_storage import RolloutStorage +from habitat_baselines.common.tensor_dict import TensorDict + +EPS_PPO = 1e-5 + + +@baseline_registry.register_policy +class HrlRolloutStorage(RolloutStorage): + def __init__(self, numsteps, num_envs, *args, **kwargs): + super().__init__(numsteps, num_envs, *args, **kwargs) + self._num_envs = num_envs + self._numsteps = numsteps + self._cur_step_idxs = torch.zeros(self._num_envs, dtype=torch.long) + self._last_should_inserts = None + if self.is_double_buffered: + raise ValueError( + "HRL storage does not support double buffered sampling" + ) + + def insert( + self, + next_observations=None, + next_recurrent_hidden_states=None, + actions=None, + action_log_probs=None, + value_preds=None, + rewards=None, + next_masks=None, + buffer_index: int = 0, + should_inserts=None, + ): + next_step = dict( + observations=next_observations, + recurrent_hidden_states=next_recurrent_hidden_states, + prev_actions=actions, + masks=next_masks, + ) + + current_step = dict( + actions=actions, + action_log_probs=action_log_probs, + value_preds=value_preds, + ) + + next_step = {k: v for k, v in next_step.items() if v is not None} + current_step = {k: v for k, v in current_step.items() if v is not None} + + if should_inserts is None: + should_inserts = self._last_should_inserts + assert should_inserts is not None + + if should_inserts.sum() == 0: + return + + env_idxs = torch.arange(self._num_envs) + if rewards is not None: + # Accumulate rewards between updates. + reward_write_idxs = torch.maximum( + self._cur_step_idxs - 1, torch.zeros_like(self._cur_step_idxs) + ) + self.buffers["rewards"][reward_write_idxs, env_idxs] += rewards.to( + self.device + ) + + if len(next_step) > 0: + self.buffers.set( + ( + self._cur_step_idxs[should_inserts] + 1, + env_idxs[should_inserts], + ), + next_step, + strict=False, + ) + + if len(current_step) > 0: + self.buffers.set( + ( + self._cur_step_idxs[should_inserts], + env_idxs[should_inserts], + ), + current_step, + strict=False, + ) + self._last_should_inserts = should_inserts + + def advance_rollout(self, buffer_index: int = 0): + self._cur_step_idxs += self._last_should_inserts.long() + + is_past_buffer = self._cur_step_idxs >= self._numsteps + if is_past_buffer.sum() > 0: + self._cur_step_idxs[is_past_buffer] = self._numsteps - 1 + env_idxs = torch.arange(self._num_envs) + self.buffers["rewards"][ + self._cur_step_idxs[is_past_buffer], env_idxs[is_past_buffer] + ] = 0.0 + + def after_update(self): + env_idxs = torch.arange(self._num_envs) + self.buffers[0] = self.buffers[self._cur_step_idxs, env_idxs] + self.buffers["masks"][1:] = False + self.buffers["rewards"][1:] = 0.0 + + self.current_rollout_step_idxs = [ + 0 for _ in self.current_rollout_step_idxs + ] + self._cur_step_idxs[:] = 0 + + def compute_returns(self, next_value, use_gae, gamma, tau): + if not use_gae: + raise ValueError("Only GAE is supported with HRL trainer") + + assert isinstance(self.buffers["value_preds"], torch.Tensor) + gae = 0.0 + for step in reversed(range(self._cur_step_idxs.max() - 1)): + delta = ( + self.buffers["rewards"][step] + + gamma + * self.buffers["value_preds"][step + 1] + * self.buffers["masks"][step + 1] + - self.buffers["value_preds"][step] + ) + gae = delta + gamma * tau * gae * self.buffers["masks"][step + 1] + self.buffers["returns"][step] = ( # type: ignore + gae + 
self.buffers["value_preds"][step] # type: ignore + ) + + def recurrent_generator( + self, advantages, num_batches + ) -> Iterator[TensorDict]: + num_environments = advantages.size(1) + for inds in torch.randperm(num_environments).chunk(num_batches): + batch = self.buffers[0 : self._numsteps, inds] + batch["advantages"] = advantages[: self._numsteps, inds] + batch["recurrent_hidden_states"] = batch[ + "recurrent_hidden_states" + ][0:1] + batch["loss_mask"] = ( + torch.arange(self._numsteps, device=advantages.device) + .view(-1, 1, 1) + .repeat(1, len(inds), 1) + ) + for i, env_i in enumerate(inds): + # The -1 is to throw out the last transition. + batch["loss_mask"][:, i] = ( + batch["loss_mask"][:, i] < self._cur_step_idxs[env_i] - 1 + ) + + yield batch.map(lambda v: v.flatten(0, 1)) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index e58a68daad..374e17494c 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -124,7 +124,7 @@ def _is_skill_done( prev_pos = self._prev_pos[batch_i] if prev_pos is not None: movement = torch.linalg.norm(prev_pos - cur_pos[i]) - ret[i] = movement < self._config.STOP_THRESH + ret[i] = movement < self._config.stop_thresh self._prev_pos[batch_i] = cur_pos[i] return ret diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index b0fd2ddb42..4e95a29221 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -44,7 +44,9 @@ def __init__( ) else: self._pddl_ac_start = None - self._delay_term = [None for _ in range(self._batch_size)] + self._delay_term: List[Optional[bool]] = [ + None for _ in range(self._batch_size) + ] self._grip_ac_idx = 0 found_grip = False @@ -104,6 +106,10 @@ def _apply_postcond( action = self._pddl_problem.actions[skill_name] entities = [self._pddl_problem.get_entity(x) for x in skill_args] + if self._pddl_ac_start is None: + raise ValueError( + "Apply post cond not supported when pddl action not in action space" + ) ac_idx = self._pddl_ac_start found = False @@ -139,6 +145,7 @@ def should_terminate( rnn_hidden_states, prev_actions, masks, + actions, hl_says_term, batch_idx: List[int], skill_name: List[str], @@ -156,37 +163,43 @@ def should_terminate( observations, ) + cur_skill_step = self._cur_skill_step[batch_idx] bad_terminate = torch.zeros( - self._cur_skill_step.shape, - device=self._cur_skill_step.device, + cur_skill_step.shape, + dtype=torch.bool, + ) + + bad_terminate = torch.zeros( + cur_skill_step.shape, + device=cur_skill_step.device, dtype=torch.bool, ) if self._config.max_skill_steps > 0: - over_max_len = self._cur_skill_step > self._config.max_skill_steps + over_max_len = cur_skill_step > self._config.max_skill_steps if self._config.force_end_on_timeout: bad_terminate = over_max_len else: is_skill_done = is_skill_done | over_max_len - # for i, env_i in enumerate(batch_idx): - # if self._delay_term[env_i]: - # self._delay_term[env_i] = False - # is_skill_done[i] = 1.0 - # elif ( - # self._config.apply_postconds - # and is_skill_done[i] == 1.0 - # and hl_says_term[i] == 0.0 - # ): - # actions = self._apply_postcond( - # actions, log_info, skill_name[i], env_i, i - # ) - # self._delay_term[env_i] = True - # is_skill_done[i] = 0.0 + for i, env_i in enumerate(batch_idx): + if 
self._delay_term[env_i]: + self._delay_term[env_i] = False + is_skill_done[i] = 1.0 + elif ( + self._config.apply_postconds + and is_skill_done[i] == 1.0 + and hl_says_term[i] == 0.0 + ): + actions = self._apply_postcond( + actions, log_info, skill_name[i], env_i, i + ) + self._delay_term[env_i] = True + is_skill_done[i] = 0.0 is_skill_done |= hl_says_term if bad_terminate.sum() > 0: self._internal_log( - f"Bad terminating due to timeout {self._cur_skill_step}, {bad_terminate}", + f"Bad terminating due to timeout {cur_skill_step}, {bad_terminate}", observations, ) diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index bfb8ac6f4a..22b7d95205 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -39,6 +39,7 @@ class PolicyAction: action_log_probs: Optional[torch.Tensor] = None take_actions: Optional[torch.Tensor] = None policy_info: Optional[List[Dict[str, Any]]] = None + should_inserts: Optional[torch.Tensor] = None def write_action(self, write_idx, write_action): self.actions[:, write_idx] = write_action diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo.py index 20ccfe83aa..7c8e816898 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo.py @@ -6,7 +6,7 @@ import collections import inspect -from typing import Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union import torch import torch.nn as nn @@ -15,6 +15,7 @@ from torch import Tensor from habitat.utils import profiling_wrapper +from habitat_baselines.common.baseline_registry import baseline_registry from habitat_baselines.common.rollout_storage import RolloutStorage from habitat_baselines.rl.ppo.policy import NetPolicy from habitat_baselines.rl.ver.ver_rollout_storage import VERRolloutStorage @@ -26,6 +27,7 @@ EPS_PPO = 1e-5 +@baseline_registry.register_updater class PPO(nn.Module): entropy_coef: Union[float, LagrangeInequalityCoefficient] @@ -155,14 +157,10 @@ def _set_grads_to_none(self): for p in pg["params"]: p.grad = None - def update( - self, - rollouts: RolloutStorage, - ) -> Dict[str, float]: - - advantages = self.get_advantages(rollouts) - - learner_metrics = collections.defaultdict(list) + def _update_from_batch(self, batch, epoch, rollouts, learner_metrics): + """ + Performs a gradient update from the minibatch. 
+ """ def record_min_mean_max(t: torch.Tensor, prefix: str): for name, op in ( @@ -172,145 +170,148 @@ def record_min_mean_max(t: torch.Tensor, prefix: str): ): learner_metrics[f"{prefix}_{name}"].append(op(t)) - for epoch in range(self.ppo_epoch): - profiling_wrapper.range_push("PPO.update epoch") - data_generator = rollouts.recurrent_generator( - advantages, self.num_mini_batch + self._set_grads_to_none() + + ( + values, + action_log_probs, + dist_entropy, + _, + aux_loss_res, + ) = self._evaluate_actions( + batch["observations"], + batch["recurrent_hidden_states"], + batch["prev_actions"], + batch["masks"], + batch["actions"], + batch["rnn_build_seq_info"], + ) + + ratio = torch.exp(action_log_probs - batch["action_log_probs"]) + + surr1 = batch["advantages"] * ratio + surr2 = batch["advantages"] * ( + torch.clamp( + ratio, + 1.0 - self.clip_param, + 1.0 + self.clip_param, ) + ) + action_loss = -torch.min(surr1, surr2) - for _bid, batch in enumerate(data_generator): - self._set_grads_to_none() - - ( - values, - action_log_probs, - dist_entropy, - _, - aux_loss_res, - ) = self._evaluate_actions( - batch["observations"], - batch["recurrent_hidden_states"], - batch["prev_actions"], - batch["masks"], - batch["actions"], - batch["rnn_build_seq_info"], - ) + values = values.float() + orig_values = values - ratio = torch.exp(action_log_probs - batch["action_log_probs"]) + if self.use_clipped_value_loss: + delta = values.detach() - batch["value_preds"] + value_pred_clipped = batch["value_preds"] + delta.clamp( + -self.clip_param, self.clip_param + ) - surr1 = batch["advantages"] * ratio - surr2 = batch["advantages"] * ( - torch.clamp( - ratio, - 1.0 - self.clip_param, - 1.0 + self.clip_param, - ) - ) - action_loss = -torch.min(surr1, surr2) + values = torch.where( + delta.abs() < self.clip_param, + values, + value_pred_clipped, + ) - values = values.float() - orig_values = values + value_loss = 0.5 * F.mse_loss( + values, batch["returns"], reduction="none" + ) - if self.use_clipped_value_loss: - delta = values.detach() - batch["value_preds"] - value_pred_clipped = batch["value_preds"] + delta.clamp( - -self.clip_param, self.clip_param - ) + if "is_coeffs" in batch: + assert isinstance(batch["is_coeffs"], torch.Tensor) + ver_is_coeffs = batch["is_coeffs"].clamp(max=1.0) + mean_fn = lambda t: torch.mean(ver_is_coeffs * t) + else: + mean_fn = torch.mean - values = torch.where( - delta.abs() < self.clip_param, - values, - value_pred_clipped, - ) + action_loss, value_loss, dist_entropy = map( + mean_fn, + (action_loss, value_loss, dist_entropy), + ) - value_loss = 0.5 * F.mse_loss( - values, batch["returns"], reduction="none" - ) + all_losses = [ + self.value_loss_coef * value_loss, + action_loss, + ] - if "is_coeffs" in batch: - assert isinstance(batch["is_coeffs"], torch.Tensor) - ver_is_coeffs = batch["is_coeffs"].clamp(max=1.0) - mean_fn = lambda t: torch.mean(ver_is_coeffs * t) - else: - mean_fn = torch.mean + if isinstance(self.entropy_coef, float): + all_losses.append(-self.entropy_coef * dist_entropy) + else: + all_losses.append(self.entropy_coef.lagrangian_loss(dist_entropy)) - action_loss, value_loss, dist_entropy = map( - mean_fn, - (action_loss, value_loss, dist_entropy), - ) + all_losses.extend(v["loss"] for v in aux_loss_res.values()) - all_losses = [ - self.value_loss_coef * value_loss, - action_loss, - ] + total_loss = torch.stack(all_losses).sum() - if isinstance(self.entropy_coef, float): - all_losses.append(-self.entropy_coef * dist_entropy) - else: - all_losses.append( - 
self.entropy_coef.lagrangian_loss(dist_entropy) - ) + total_loss = self.before_backward(total_loss) + total_loss.backward() + self.after_backward(total_loss) - all_losses.extend(v["loss"] for v in aux_loss_res.values()) + grad_norm = self.before_step() + self.optimizer.step() + self.after_step() - total_loss = torch.stack(all_losses).sum() + with inference_mode(): + if "is_coeffs" in batch: + record_min_mean_max(batch["is_coeffs"], "ver_is_coeffs") + record_min_mean_max(orig_values, "value_pred") + record_min_mean_max(ratio, "prob_ratio") + + learner_metrics["value_loss"].append(value_loss) + learner_metrics["action_loss"].append(action_loss) + learner_metrics["dist_entropy"].append(dist_entropy) + if epoch == (self.ppo_epoch - 1): + learner_metrics["ppo_fraction_clipped"].append( + (ratio > (1.0 + self.clip_param)).float().mean() + + (ratio < (1.0 - self.clip_param)).float().mean() + ) - total_loss = self.before_backward(total_loss) - total_loss.backward() - self.after_backward(total_loss) + learner_metrics["grad_norm"].append(grad_norm) + if isinstance(self.entropy_coef, LagrangeInequalityCoefficient): + learner_metrics["entropy_coef"].append( + self.entropy_coef().detach() + ) - grad_norm = self.before_step() - self.optimizer.step() - self.after_step() + for name, res in aux_loss_res.items(): + for k, v in res.items(): + learner_metrics[f"aux_{name}_{k}"].append(v.detach()) - with inference_mode(): - if "is_coeffs" in batch: - record_min_mean_max( - batch["is_coeffs"], "ver_is_coeffs" - ) - record_min_mean_max(orig_values, "value_pred") - record_min_mean_max(ratio, "prob_ratio") - - learner_metrics["value_loss"].append(value_loss) - learner_metrics["action_loss"].append(action_loss) - learner_metrics["dist_entropy"].append(dist_entropy) - if epoch == (self.ppo_epoch - 1): - learner_metrics["ppo_fraction_clipped"].append( - (ratio > (1.0 + self.clip_param)).float().mean() - + (ratio < (1.0 - self.clip_param)).float().mean() - ) + if "is_stale" in batch: + assert isinstance(batch["is_stale"], torch.Tensor) + learner_metrics["fraction_stale"].append( + batch["is_stale"].float().mean() + ) - learner_metrics["grad_norm"].append(grad_norm) - if isinstance( - self.entropy_coef, LagrangeInequalityCoefficient - ): - learner_metrics["entropy_coef"].append( - self.entropy_coef().detach() - ) + if isinstance(rollouts, VERRolloutStorage): + assert isinstance(batch["policy_version"], torch.Tensor) + record_min_mean_max( + ( + rollouts.current_policy_version + - batch["policy_version"] + ).float(), + "policy_version_difference", + ) - for name, res in aux_loss_res.items(): - for k, v in res.items(): - learner_metrics[f"aux_{name}_{k}"].append( - v.detach() - ) + def update( + self, + rollouts: RolloutStorage, + ) -> Dict[str, float]: - if "is_stale" in batch: - assert isinstance(batch["is_stale"], torch.Tensor) - learner_metrics["fraction_stale"].append( - batch["is_stale"].float().mean() - ) + advantages = self.get_advantages(rollouts) - if isinstance(rollouts, VERRolloutStorage): - assert isinstance( - batch["policy_version"], torch.Tensor - ) - record_min_mean_max( - ( - rollouts.current_policy_version - - batch["policy_version"] - ).float(), - "policy_version_difference", - ) + learner_metrics: Dict[str, List[Any]] = collections.defaultdict(list) + + for epoch in range(self.ppo_epoch): + profiling_wrapper.range_push("PPO.update epoch") + data_generator = rollouts.recurrent_generator( + advantages, self.num_mini_batch + ) + + for _bid, batch in enumerate(data_generator): + self._update_from_batch( 
+ batch, epoch, rollouts, learner_metrics + ) profiling_wrapper.range_pop() # PPO.update epoch diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index 42bb765815..a5a87c1b6b 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -36,12 +36,14 @@ apply_obs_transforms_obs_space, get_active_obs_transforms, ) -from habitat_baselines.common.rollout_storage import RolloutStorage +from habitat_baselines.common.rollout_storage import ( # noqa: F401. + RolloutStorage, +) from habitat_baselines.common.tensorboard_utils import ( TensorboardWriter, get_writer, ) -from habitat_baselines.rl.ddppo.algo import DDPPO +from habitat_baselines.rl.ddppo.algo import DDPPO # noqa: F401. from habitat_baselines.rl.ddppo.ddp_utils import ( EXIT, get_distrib_size, @@ -59,7 +61,7 @@ from habitat_baselines.rl.hrl.hierarchical_policy import ( # noqa: F401. HierarchicalPolicy, ) -from habitat_baselines.rl.ppo import PPO +from habitat_baselines.rl.ppo import PPO # noqa: F401. from habitat_baselines.rl.ppo.policy import NetPolicy from habitat_baselines.utils.common import ( batch_obs, @@ -189,9 +191,14 @@ def _setup_actor_critic_agent(self, ppo_cfg: "DictConfig") -> None: nn.init.orthogonal_(self.actor_critic.critic.fc.weight) nn.init.constant_(self.actor_critic.critic.fc.bias, 0) - self.agent = (DDPPO if self._is_distributed else PPO).from_config( - self.actor_critic, ppo_cfg - ) + if self._is_distributed: + agent_cls = baseline_registry.get_updater( + self.config.distrib_updater_name + ) + else: + agent_cls = baseline_registry.get_updater(self.config.updater_name) + + self.agent = agent_cls.from_config(self.actor_critic, ppo_cfg) def _init_envs(self, config=None, is_eval: bool = False): if config is None: @@ -329,13 +336,16 @@ def _init_train(self, resume_state=None): self._nbuffers = 2 if ppo_cfg.use_double_buffered_sampler else 1 - self.rollouts = RolloutStorage( + rollouts_cls = baseline_registry.get_storage( + self.config.rollout_storage + ) + self.rollouts = rollouts_cls( ppo_cfg.num_steps, self.envs.num_envs, obs_space, self.policy_action_space, ppo_cfg.hidden_size, - num_recurrent_layers=self.actor_critic.net.num_recurrent_layers, + num_recurrent_layers=self.actor_critic.num_recurrent_layers, is_double_buffered=ppo_cfg.use_double_buffered_sampler, action_shape=action_shape, discrete_actions=discrete_actions, @@ -505,6 +515,7 @@ def _compute_actions_and_step_envs(self, buffer_index: int = 0): action_log_probs=action_data.action_log_probs, value_preds=action_data.values, buffer_index=buffer_index, + should_inserts=action_data.should_inserts, ) def _collect_environment_result(self, buffer_index: int = 0): diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index ccbb1aa984..f54e12a7e1 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -129,6 +129,11 @@ class RearrangeStopActionConfig(ActionConfig): type: str = "RearrangeStopAction" +@dataclass +class PddlApplyActionConfig(ActionConfig): + type: str = "PddlApplyAction" + + @dataclass class OracleNavActionConfig(ActionConfig): """ @@ -143,6 +148,9 @@ class OracleNavActionConfig(ActionConfig): forward_velocity: float = 1.0 turn_thresh: float = 0.1 dist_thresh: float = 0.2 + stop_thresh: float = 0.001 + spawn_max_dist_to_obj: float = 2.0 + 
num_spawn_attempts: int = 200 lin_speed: float = 10.0 ang_speed: float = 10.0 allow_dyn_slide: bool = True @@ -723,7 +731,7 @@ class TaskConfig(HabitatBaseConfig): # Spawn parameters physics_stability_steps: int = 1 num_spawn_attempts: int = 200 - spawn_max_dists_to_obj: float = 2.0 + spawn_max_dist_to_obj: float = 2.0 base_angle_noise: float = 0.523599 # EE sample parameters ee_sample_factor: float = 0.2 @@ -1186,6 +1194,12 @@ class HabitatConfig(HabitatBaseConfig): name="oracle_nav_action", node=OracleNavActionConfig, ) +cs.store( + package="habitat.task.actions.pddl_apply_action", + group="habitat/task/actions", + name="pddl_apply_action", + node=PddlApplyActionConfig, +) # Dataset Config Schema cs.store( @@ -1313,6 +1327,12 @@ class HabitatConfig(HabitatBaseConfig): name="instance_imagegoal_hfov_sensor", node=InstanceImageGoalHFOVSensorConfig, ) +cs.store( + package="habitat.task.lab_sensors.localization_sensor", + group="habitat/task/lab_sensors", + name="localization_sensor", + node=LocalizationSensorConfig, +) cs.store( package="habitat.task.lab_sensors.target_start_sensor", group="habitat/task/lab_sensors", diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange.yaml index 13dcf40da7..9f3b3f3608 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange.yaml @@ -27,6 +27,7 @@ defaults: - relative_resting_pos_sensor - target_start_gps_compass_sensor - target_goal_gps_compass_sensor + - localization_sensor - _self_ type: RearrangeCompositeTask-v0 diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml index 793fbc1847..c138ca2238 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml @@ -28,6 +28,7 @@ defaults: - end_effector_sensor - target_start_gps_compass_sensor - target_goal_gps_compass_sensor + - localization_sensor - _self_ type: RearrangeCompositeTask-v0 diff --git a/habitat-lab/habitat/tasks/rearrange/actions/pddl_actions.py b/habitat-lab/habitat/tasks/rearrange/actions/pddl_actions.py index 279bc079f7..df54c59593 100644 --- a/habitat-lab/habitat/tasks/rearrange/actions/pddl_actions.py +++ b/habitat-lab/habitat/tasks/rearrange/actions/pddl_actions.py @@ -8,7 +8,6 @@ from habitat.core.registry import registry from habitat.sims.habitat_simulator.actions import HabitatSimActions from habitat.tasks.rearrange.actions.grip_actions import RobotAction -from habitat.tasks.rearrange.utils import rearrange_logger @registry.register_task_action @@ -38,7 +37,10 @@ def action_space(self): { self._action_arg_prefix + "pddl_action": spaces.Box( - shape=(action_n_args,), low=-1, high=1, dtype=np.float32 + shape=(action_n_args,), + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + dtype=np.float32, ) } ) @@ -49,6 +51,7 @@ def was_prev_action_invalid(self): def reset(self, *args, **kwargs): self._was_prev_action_invalid = False + self._prev_action = None def get_pddl_action_start(self, action_id: int) -> int: start_idx = 0 @@ -56,10 +59,8 @@ def get_pddl_action_start(self, action_id: int) -> int: start_idx += action.n_args return start_idx - def step(self, *args, is_last_action, **kwargs): - apply_pddl_action = kwargs[self._action_arg_prefix + "pddl_action"] + def _apply_action(self, apply_pddl_action): cur_i = 0 
- self._was_prev_action_invalid = False for action in self._action_ordering: action_part = apply_pddl_action[cur_i : cur_i + action.n_args][:] if sum(action_part) > 0: @@ -71,26 +72,31 @@ def step(self, *args, is_last_action, **kwargs): raise ValueError( f"Got invalid action value < 0 in {action_part} with action {action}" ) - rearrange_logger.debug(f"Got action part {real_action_idxs}") param_values = [ self._entities_list[i] for i in real_action_idxs ] - apply_action = action.copy() + apply_action = action.clone() apply_action.set_param_values(param_values) + self._prev_action = apply_action if self._task.pddl_problem.is_expr_true(apply_action.precond): - rearrange_logger.debug( - f"Applying action {action} with obj args {param_values}" - ) self._task.pddl_problem.apply_action(apply_action) else: - rearrange_logger.debug( - f"Preconds not satisfied for: action {action} with obj args {param_values}" - ) self._was_prev_action_invalid = True cur_i += action.n_args + + def step(self, *args, is_last_action, **kwargs): + self._prev_action = None + apply_pddl_action = kwargs[self._action_arg_prefix + "pddl_action"] + self._was_prev_action_invalid = False + inputs_outside = any( + a < 0 or a > len(self._entities_list) for a in apply_pddl_action + ) + if not inputs_outside: + self._apply_action(apply_pddl_action) + if is_last_action: return self._sim.step(HabitatSimActions.arm_action) else: diff --git a/habitat-lab/habitat/tasks/rearrange/sub_tasks/pick_task.py b/habitat-lab/habitat/tasks/rearrange/sub_tasks/pick_task.py index 23eebb33a3..f1f396d169 100644 --- a/habitat-lab/habitat/tasks/rearrange/sub_tasks/pick_task.py +++ b/habitat-lab/habitat/tasks/rearrange/sub_tasks/pick_task.py @@ -58,7 +58,7 @@ def _gen_start_pos(self, sim, episode, sel_idx): start_pos, angle_to_obj, was_succ = get_robot_spawns( snap_pos, self._config.base_angle_noise, - self._config.spawn_max_dists_to_obj, + self._config.spawn_max_dist_to_obj, sim, self._config.num_spawn_attempts, self._config.physics_stability_steps, diff --git a/test/test_baseline_training.py b/test/test_baseline_training.py index eacd10759d..43a72477f7 100644 --- a/test/test_baseline_training.py +++ b/test/test_baseline_training.py @@ -154,19 +154,24 @@ def test_trainers(config_path, num_updates, overrides, trainer_name): [ ( "habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml", - "eval", - "ppo", - ), - ( - "habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml", - "eval", - "ppo", - ), - ( - "habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml", - "eval", + "train", "ppo", ), + # ( + # "habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml", + # "eval", + # "ppo", + # ), + # ( + # "habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml", + # "eval", + # "ppo", + # ), + # ( + # "habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml", + # "eval", + # "ppo", + # ), ], ) def test_hrl(config_path, mode, trainer_name): @@ -180,6 +185,7 @@ def test_hrl(config_path, mode, trainer_name): [ "habitat_baselines.num_updates=1", "habitat_baselines.eval.split=minival", + "habitat.dataset.split=minival", "habitat_baselines.total_num_steps=-1.0", "habitat_baselines.test_episode_count=1", "habitat_baselines.checkpoint_folder=data/test_checkpoints/test_training", From daa84dbec3250dfc2be3622d4adfa3a40f1e99d4 Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 27 Dec 2022 06:36:32 +0100 Subject: [PATCH 03/48] Fixed config setup --- 
.../habitat_baselines/rl/hrl/hrl_rollout_storage.py | 2 +- habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index b7750c5493..7d728afdfa 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -15,7 +15,7 @@ EPS_PPO = 1e-5 -@baseline_registry.register_policy +@baseline_registry.register_storage class HrlRolloutStorage(RolloutStorage): def __init__(self, numsteps, num_envs, *args, **kwargs): super().__init__(numsteps, num_envs, *args, **kwargs) diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index a5a87c1b6b..fdcd9d33ca 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -193,10 +193,12 @@ def _setup_actor_critic_agent(self, ppo_cfg: "DictConfig") -> None: if self._is_distributed: agent_cls = baseline_registry.get_updater( - self.config.distrib_updater_name + self.config.habitat_baselines.distrib_updater_name ) else: - agent_cls = baseline_registry.get_updater(self.config.updater_name) + agent_cls = baseline_registry.get_updater( + self.config.habitat_baselines.updater_name + ) self.agent = agent_cls.from_config(self.actor_critic, ppo_cfg) @@ -337,7 +339,7 @@ def _init_train(self, resume_state=None): self._nbuffers = 2 if ppo_cfg.use_double_buffered_sampler else 1 rollouts_cls = baseline_registry.get_storage( - self.config.rollout_storage + self.config.habitat_baselines.rollout_storage ) self.rollouts = rollouts_cls( ppo_cfg.num_steps, From 7ae2c6b1de4e622f2a0dc8259df5124d70af021f Mon Sep 17 00:00:00 2001 From: akshararai Date: Fri, 6 Jan 2023 18:40:31 -0500 Subject: [PATCH 04/48] Train hl modif (#1057) * separate config policy * udpate config * renamed hierarchical_srl_learned_nav.yaml to hierarchical_srl.yaml since that already implies a learned nav * renamed yaml files to follow convention of _tp_ for task planner, _srl_ for learned skills, _onav_ for oracle nav * order of parameters in config to avoid overwite * added documentation on how to run the HRL policy, removed auto_name, not needed in this config * remove empty file * fixed the yaml linked in the readme * updated the description of the config fike * Delete tp_noop_onav.yaml don't need this file anymore, added readme instructions to do this by changing hydra parameters. * setting the agent back to depth_head_agent as _vis slows down the training. 
Apologies * removed one config file, since it seemed like mostly a repeat of the tp config * cleaned up config files by removing redundant parameters * renamed monolithic_pointnav to be monolithic * fixed the comment * added localization_sensor to all task lab_sensors Co-authored-by: Xavier Puig --- habitat-baselines/habitat_baselines/README.md | 25 +++++ .../rl/policy/hierarchical_srl_onav.yaml | 77 +++++++++++++ .../rl/policy/hierarchical_tp_noop_onav.yaml | 72 ++++++++++++ .../rl/policy/hierarchical_tp_srl.yaml | 71 ++++++++++++ .../rl/policy/monolithic.yaml | 4 + .../config/rearrange/rl_hl_srl_onav.yaml | 103 +----------------- .../config/rearrange/rl_rearrange.yaml | 6 +- .../config/rearrange/rl_rearrange_easy.yaml | 6 +- .../config/rearrange/rl_skill.yaml | 4 +- .../config/rearrange/tp_srl.yaml | 85 +-------------- .../config/rearrange/tp_srl_oracle_nav.yaml | 85 +-------------- .../task/rearrange/prepare_groceries.yaml | 1 + .../rearrange/rearrange_easy_multi_agent.yaml | 1 + .../habitat/task/rearrange/set_table.yaml | 1 + .../habitat/task/rearrange/tidy_house.yaml | 1 + 15 files changed, 263 insertions(+), 279 deletions(-) create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml diff --git a/habitat-baselines/habitat_baselines/README.md b/habitat-baselines/habitat_baselines/README.md index 85e89065d6..e2e06f258d 100644 --- a/habitat-baselines/habitat_baselines/README.md +++ b/habitat-baselines/habitat_baselines/README.md @@ -53,6 +53,31 @@ To use them download pre-trained pytorch models from [link](https://dl.fbaipubli The `habitat_baselines/config/pointnav/ppo_pointnav.yaml` config has better hyperparameters for large scale training and loads the [Gibson PointGoal Navigation Dataset](/README.md#datasets) instead of the test scenes. Change the `/benchmark/nav/pointnav: pointnav_gibson` in `habitat_baselines/config/pointnav/ppo_pointnav.yaml` to `/benchmark/nav/pointnav: pointnav_mp3d` in the defaults list for training on [MatterPort3D PointGoal Navigation Dataset](/README.md#datasets). +### Hierarchical Reinforcement Learning (HRL) + +We provide a two-layer hierarchical policy class, consisting of a low-level skill that moves the robot, and a high-level policy that reasons about which low-level skill to use in the current state. This can be especially powerful in long-horizon mobile manipulation tasks, like those introduced in [Habitat 2.0](https://arxiv.org/abs/2106.14405). Both the low- and high-level can be either learned or an oracle. For the oracle high-level we use [PDDL](https://planning.wiki/guide/whatis/pddl), and for the oracle low-level we use instantaneous transitions, with the environment set to the final desired state. Additionally, for navigation, we provide an oracle navigation skill that uses A-star and the map of the environment to move the robot to its goal. + +To run the following examples, you need the [ReplicaCAD dataset](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md#replicacad).
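The same configs can also be driven programmatically; a minimal sketch (mirroring how the tests in this PR call `execute_exp`, and assuming the repository-relative config path used in the commands below) is:

```python
# Sketch only: launching the HRL config from Python instead of run.py's CLI.
from habitat_baselines.config.default import get_config
from habitat_baselines.run import execute_exp

config = get_config(
    "habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml",
    ["habitat_baselines.num_updates=1"],  # keep the run short for a smoke test
)
execute_exp(config, "train")
```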
+ +To train a high-level policy, while using pre-learned low-level skills (SRL baseline from [Habitat2.0](https://arxiv.org/abs/2106.14405)), you can run: + +```bash +python -u habitat_baselines/run.py \ + --exp-config habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml \ + --run-type train +``` +To run a rearrangement episode with oracle in both low- and high-level, you can run: + +```bash +python -u habitat_baselines/run.py \ + --exp-config habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml \ + --run-type eval \ + habitat_baselines/rl/policy=hierarchical_tp_noop_onav +``` + +To change the task (like set table) that you train your skills on, you can change the line `/habitat/task/rearrange: rearrange_easy` to `/habitat/task/rearrange: set_table` in the defaults of your config. + + ### Classic **SLAM based** diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml new file mode 100644 index 0000000000..d92283dbc2 --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml @@ -0,0 +1,77 @@ +name: "HierarchicalPolicy" +obs_transforms: + add_virtual_keys: + virtual_keys: + "goal_to_agent_gps_compass": 2 +hierarchical_policy: + high_level_policy: + name: "NeuralHighLevelPolicy" + allow_other_place: False + hidden_dim: 512 + use_rnn: True + rnn_type: 'LSTM' + backbone: resnet18 + normalize_visual_inputs: False + num_rnn_layers: 2 + policy_input_keys: + - "robot_head_depth" + defined_skills: + open_cab: + skill_name: "ArtObjSkillPolicy" + name: "PointNavResNetPolicy" + load_ckpt_file: "data/models/open_cab.pth" + + open_fridge: + skill_name: "ArtObjSkillPolicy" + load_ckpt_file: "data/models/open_fridge.pth" + + close_cab: + skill_name: "ArtObjSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/close_cab.pth" + + close_fridge: + skill_name: "ArtObjSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/close_fridge.pth" + + pick: + skill_name: "PickSkillPolicy" + name: "PointNavResNetPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/pick.pth" + + place: + skill_name: "PlaceSkillPolicy" + name: "PointNavResNetPolicy" + obs_skill_inputs: ["obj_goal_sensor"] + load_ckpt_file: "data/models/place.pth" + + wait_skill: + skill_name: "WaitSkillPolicy" + max_skill_steps: -1 + force_end_on_timeout: False + + oracle_nav: + skill_name: "OracleNavPolicy" + obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] + max_skill_steps: 300 + + reset_arm_skill: + skill_name: "ResetArmSkill" + max_skill_steps: 50 + reset_joint_state: [-4.50e-01, -1.08e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] + force_end_on_timeout: False + + use_skills: + # Uncomment if you are also using these skills + # open_cab: "open_cab" + # open_fridge: "open_fridge" + # close_cab: "open_cab" + # close_fridge: "open_fridge" + pick: "pick" + place: "place" + nav: "oracle_nav" + nav_to_receptacle: "oracle_nav" + wait: "wait_skill" + reset_arm: "reset_arm_skill" \ No newline at end of file diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml new file mode 100644 index 0000000000..d090a528c7 --- /dev/null +++ 
b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml @@ -0,0 +1,72 @@ + +name: "HierarchicalPolicy" +obs_transforms: + add_virtual_keys: + virtual_keys: + "goal_to_agent_gps_compass": 2 +hierarchical_policy: + high_level_policy: + name: "FixedHighLevelPolicy" + add_arm_rest: True + + defined_skills: + open_cab: + skill_name: "NoopSkillPolicy" + max_skill_steps: 1 + apply_postconds: True + + open_fridge: + skill_name: "NoopSkillPolicy" + max_skill_steps: 1 + apply_postconds: True + + close_cab: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + + close_fridge: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + apply_postconds: True + + pick: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + apply_postconds: True + + place: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_goal_sensor"] + max_skill_steps: 1 + apply_postconds: True + + nav_to_obj: + skill_name: "OracleNavPolicy" + obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] + max_skill_steps: 300 + + wait_skill: + skill_name: "WaitSkillPolicy" + max_skill_steps: -1 + force_end_on_timeout: False + + reset_arm_skill: + skill_name: "ResetArmSkill" + max_skill_steps: 50 + reset_joint_state: [-4.50e-01, -1.07e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] + force_end_on_timeout: False + + use_skills: + open_cab: "open_cab" + open_fridge: "open_fridge" + close_cab: "close_cab" + close_fridge: "close_fridge" + pick: "pick" + place: "place" + nav: "nav_to_obj" + nav_to_receptacle: "nav_to_obj" + wait: "wait_skill" + reset_arm: "reset_arm_skill" diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml new file mode 100644 index 0000000000..3ac608e23b --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml @@ -0,0 +1,71 @@ +name: "HierarchicalPolicy" +obs_transforms: + add_virtual_keys: + virtual_keys: + "goal_to_agent_gps_compass": 2 +hierarchical_policy: + high_level_policy: + name: "FixedHighLevelPolicy" + add_arm_rest: True + defined_skills: + open_cab: + skill_name: "ArtObjSkillPolicy" + name: "PointNavResNetPolicy" + load_ckpt_file: "data/models/open_cab.pth" + + open_fridge: + skill_name: "ArtObjSkillPolicy" + skill_name: "ArtObjSkillPolicy" + load_ckpt_file: "data/models/open_fridge.pth" + + close_cab: + skill_name: "ArtObjSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/close_cab.pth" + + close_fridge: + skill_name: "ArtObjSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/close_fridge.pth" + + pick: + skill_name: "PickSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/pick.pth" + + place: + skill_name: "PlaceSkillPolicy" + obs_skill_inputs: ["obj_goal_sensor"] + load_ckpt_file: "data/models/place.pth" + + wait_skill: + skill_name: "WaitSkillPolicy" + max_skill_steps: -1 + force_end_on_timeout: False + + nav_to_obj: + skill_name: "NavSkillPolicy" + obs_skill_inputs: ["goal_to_agent_gps_compass"] + obs_skill_input_dim: 2 + load_ckpt_file: "data/models/nav.pth" + max_skill_steps: 300 + force_end_on_timeout: False + + reset_arm_skill: + skill_name: "ResetArmSkill" + 
max_skill_steps: 50 + reset_joint_state: [-4.50e-01, -1.08e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] + force_end_on_timeout: False + + use_skills: + # Uncomment if you are also using these skills + # open_cab: "open_cab" + # open_fridge: "open_fridge" + # close_cab: "open_cab" + # close_fridge: "open_fridge" + pick: "pick" + place: "place" + nav: "nav_to_obj" + nav_to_receptacle: "nav_to_obj" + wait: "wait_skill" + reset_arm: "reset_arm_skill" \ No newline at end of file diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml new file mode 100644 index 0000000000..9a1850d158 --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml @@ -0,0 +1,4 @@ +name: "PointNavResNetPolicy" +action_distribution_type: "gaussian" +action_dist: + use_log_std: True \ No newline at end of file diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml index 1675eadb6c..5ad654137d 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml @@ -1,12 +1,14 @@ # @package _global_ -# Pick and place are kinematically simulated. -# Navigation is fully simulated. +# Pick and place are instantaneous skills, +# where environment is set to final state. +# Navigation is kinematically simulated. defaults: - /benchmark/rearrange: rearrange_easy - /habitat_baselines: habitat_baselines_rl_config_base - /habitat_baselines/rl/policy/obs_transforms: - add_virtual_keys_base + - /habitat_baselines/rl/policy: hierarchical_srl_onav - /habitat/task/actions: - pddl_apply_action - oracle_nav_action @@ -17,7 +19,6 @@ defaults: habitat: gym: - auto_name: RearrangeEasy obs_keys: - robot_head_depth - relative_resting_position @@ -34,15 +35,12 @@ habitat_baselines: verbose: False trainer_name: "ddppo" torch_gpu_id: 0 - tensorboard_dir: "tb" rollout_storage: "HrlRolloutStorage" updater_name: "HrlPPO" distrib_updater_name: "HrlDDPPO" - video_dir: "video_dir" video_fps: 30 video_render_views: - "third_rgb_sensor" - test_episode_count: -1 eval_ckpt_path_dir: "" num_environments: 3 writer_type: 'tb' @@ -61,86 +59,6 @@ habitat_baselines: video_option: ["disk"] rl: - policy: - name: "HierarchicalPolicy" - obs_transforms: - add_virtual_keys: - virtual_keys: - "goal_to_agent_gps_compass": 2 - hierarchical_policy: - high_level_policy: - name: "NeuralHighLevelPolicy" - allow_other_place: False - hidden_dim: 512 - use_rnn: True - rnn_type: 'LSTM' - backbone: resnet18 - normalize_visual_inputs: False - num_rnn_layers: 2 - policy_input_keys: - - "robot_head_depth" - defined_skills: - open_cab: - skill_name: "NoopSkillPolicy" - max_skill_steps: 1 - apply_postconds: True - - open_fridge: - skill_name: "NoopSkillPolicy" - max_skill_steps: 1 - apply_postconds: True - - close_cab: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - max_skill_steps: 1 - - close_fridge: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - max_skill_steps: 1 - apply_postconds: True - - pick: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - max_skill_steps: 1 - apply_postconds: True - - place: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_goal_sensor"] - max_skill_steps: 1 - apply_postconds: True - - 
nav_to_obj: - skill_name: "OracleNavPolicy" - obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] - max_skill_steps: 300 - - wait_skill: - skill_name: "WaitSkillPolicy" - max_skill_steps: -1 - force_end_on_timeout: False - - reset_arm_skill: - skill_name: "ResetArmSkill" - max_skill_steps: 50 - reset_joint_state: [-4.5003259e-01, -1.0799699e00, 9.9526465e-02, 9.3869519e-01, -7.8854430e-04, 1.5702540e00, 4.6168058e-03] - force_end_on_timeout: False - - use_skills: - open_cab: "open_cab" - open_fridge: "open_fridge" - close_cab: "close_cab" - close_fridge: "close_fridge" - pick: "pick" - place: "place" - nav: "nav_to_obj" - nav_to_receptacle: "nav_to_obj" - wait: "wait_skill" - reset_arm: "reset_arm_skill" - ppo: # ppo params clip_param: 0.2 @@ -155,18 +73,6 @@ habitat_baselines: use_gae: True gamma: 0.99 tau: 0.95 - use_linear_clip_decay: False - use_linear_lr_decay: False - reward_window_size: 50 - - use_normalized_advantage: False - - hidden_size: 512 - - # Use double buffered sampling, typically helps - # when environment time is similar or larger than - # policy inference time during rollout generation - use_double_buffered_sampler: False ddppo: sync_frac: 0.6 @@ -182,7 +88,6 @@ habitat_baselines: train_encoder: True # Whether to reset the critic linear layer reset_critic: False - # Model parameters backbone: resnet18 rnn_type: LSTM diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml index f96bfaf471..38a7ce3537 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml @@ -2,6 +2,7 @@ defaults: - /benchmark/rearrange: rearrange + - /habitat_baselines/rl/policy: monolithic - /habitat_baselines: habitat_baselines_rl_config_base - _self_ @@ -32,11 +33,6 @@ habitat_baselines: video_option: ["disk"] rl: - policy: - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - action_dist: - use_log_std: True ppo: # ppo params clip_param: 0.2 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange_easy.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange_easy.yaml index ed08e7a5c2..253db1cc6a 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange_easy.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange_easy.yaml @@ -3,6 +3,7 @@ defaults: - /benchmark/rearrange: rearrange_easy - /habitat_baselines: habitat_baselines_rl_config_base + - /habitat_baselines/rl/policy: monolithic - _self_ habitat_baselines: @@ -32,11 +33,6 @@ habitat_baselines: video_option: ["disk"] rl: - policy: - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - action_dist: - use_log_std: True ppo: # ppo params clip_param: 0.2 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml index 56018e9435..9f0d2e8d94 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml @@ -3,6 +3,7 @@ defaults: - /benchmark/rearrange: pick - /habitat_baselines: habitat_baselines_rl_config_base + - /habitat_baselines/rl/policy: monolithic - _self_ habitat_baselines: @@ -33,10 +34,7 @@ habitat_baselines: rl: policy: - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" 
action_dist: - use_log_std: True clamp_std: True std_init: -1.0 use_std_param: True diff --git a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml index 3f2fe45fb5..ef9e31daf2 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml @@ -5,6 +5,7 @@ defaults: - /habitat_baselines: habitat_baselines_rl_config_base - /habitat_baselines/rl/policy/obs_transforms: - add_virtual_keys_base + - /habitat_baselines/rl/policy: hierarchical_srl_learned_nav - _self_ @@ -35,90 +36,6 @@ habitat_baselines: video_option: ["disk"] rl: - policy: - name: "HierarchicalPolicy" - obs_transforms: - add_virtual_keys: - virtual_keys: - "goal_to_agent_gps_compass": 2 - hierarchical_policy: - high_level_policy: - name: "FixedHighLevelPolicy" - add_arm_rest: True - defined_skills: - open_cab: - skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - load_ckpt_file: "data/models/open_cab.pth" - max_skill_steps: 200 - - open_fridge: - skill_name: "ArtObjSkillPolicy" - load_ckpt_file: "data/models/open_fridge.pth" - max_skill_steps: 200 - - close_cab: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_cab.pth" - max_skill_steps: 200 - - close_fridge: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_fridge.pth" - max_skill_steps: 200 - - pick: - skill_name: "PickSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/pick.pth" - max_skill_steps: 200 - - place: - skill_name: "PlaceSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: ["obj_goal_sensor"] - load_ckpt_file: "data/models/place.pth" - max_skill_steps: 200 - - nav_to_obj: - skill_name: "NavSkillPolicy" - obs_skill_inputs: ["goal_to_agent_gps_compass"] - obs_skill_input_dim: 2 - load_ckpt_file: "data/models/nav.pth" - max_skill_steps: 300 - force_end_on_timeout: False - - wait_skill: - skill_name: "WaitSkillPolicy" - max_skill_steps: -1 - force_end_on_timeout: False - - reset_arm_skill: - skill_name: "ResetArmSkill" - max_skill_steps: 50 - reset_joint_state: [-4.5003259e-01, -1.0799699e00, 9.9526465e-02, 9.3869519e-01, -7.8854430e-04, 1.5702540e00, 4.6168058e-03] - force_end_on_timeout: False - - use_skills: - # Uncomment if you are also using these skills - # open_cab: "open_cab" - # open_fridge: "open_fridge" - # close_cab: "open_cab" - # close_fridgE: "open_fridge" - pick: "pick" - place: "place" - nav: "nav_to_obj" - nav_to_receptacle: "nav_to_obj" - wait: "wait_skill" - reset_arm: "reset_arm_skill" - ddppo: pretrained: False pretrained_encoder: False diff --git a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml index f456242704..6a4a769b04 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml @@ -5,6 +5,7 @@ defaults: - /habitat_baselines: habitat_baselines_rl_config_base - /habitat_baselines/rl/policy/obs_transforms: - add_virtual_keys_base + - /habitat_baselines/rl/policy: hierarchical_srl_oracle_nav - /habitat/task/actions: - 
oracle_nav_action - arm_action @@ -26,6 +27,7 @@ habitat: - is_holding - localization_sensor + habitat_baselines: verbose: False trainer_name: "ddppo" @@ -33,8 +35,6 @@ habitat_baselines: tensorboard_dir: "tb" video_dir: "video_dir" video_fps: 30 - video_render_views: - - "third_rgb_sensor" test_episode_count: -1 eval_ckpt_path_dir: "" num_environments: 1 @@ -53,87 +53,6 @@ habitat_baselines: video_option: ["disk"] rl: - policy: - name: "HierarchicalPolicy" - obs_transforms: - add_virtual_keys: - virtual_keys: - "goal_to_agent_gps_compass": 2 - hierarchical_policy: - high_level_policy: - name: "FixedHighLevelPolicy" - add_arm_rest: True - defined_skills: - open_cab: - skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - load_ckpt_file: "data/models/open_cab.pth" - max_skill_steps: 200 - - open_fridge: - skill_name: "ArtObjSkillPolicy" - load_ckpt_file: "data/models/open_fridge.pth" - max_skill_steps: 200 - - close_cab: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_cab.pth" - max_skill_steps: 200 - - close_fridge: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_fridge.pth" - max_skill_steps: 200 - - pick: - skill_name: "PickSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/pick.pth" - max_skill_steps: 200 - - place: - skill_name: "PlaceSkillPolicy" - name: "PointNavResNetPolicy" - action_distribution_type: "gaussian" - at_resting_threshold: 0.15 - obs_skill_inputs: ["obj_goal_sensor"] - load_ckpt_file: "data/models/place.pth" - max_skill_steps: 200 - - wait_skill: - skill_name: "WaitSkillPolicy" - max_skill_steps: -1 - force_end_on_timeout: False - - gt_nav: - skill_name: "OracleNavPolicy" - obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] - max_skill_steps: 300 - - reset_arm_skill: - skill_name: "ResetArmSkill" - max_skill_steps: 50 - reset_joint_state: [-4.5003259e-01, -1.0799699e00, 9.9526465e-02, 9.3869519e-01, -7.8854430e-04, 1.5702540e00, 4.6168058e-03] - force_end_on_timeout: False - - use_skills: - # Uncomment if you are also using these skills - # open_cab: "open_cab" - # open_fridge: "open_fridge" - # close_cab: "open_cab" - # close_fridge: "open_fridge" - pick: "pick" - place: "place" - nav: "gt_nav" - nav_to_receptacle: "gt_nav" - wait: "wait_skill" - reset_arm: "reset_arm_skill" - ddppo: pretrained: False pretrained_encoder: False diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/prepare_groceries.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/prepare_groceries.yaml index d404648720..bb54918c66 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/prepare_groceries.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/prepare_groceries.yaml @@ -28,6 +28,7 @@ defaults: - end_effector_sensor - target_start_gps_compass_sensor - target_goal_gps_compass_sensor + - localization_sensor - _self_ type: RearrangeCompositeTask-v0 diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy_multi_agent.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy_multi_agent.yaml index b4bcb800f2..1f84e938a7 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy_multi_agent.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy_multi_agent.yaml @@ -24,6 
+24,7 @@ defaults: - end_effector_sensor - target_start_gps_compass_sensor - target_goal_gps_compass_sensor + - localization_sensor - _self_ type: RearrangeCompositeTask-v0 diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/set_table.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/set_table.yaml index 662f8da3bb..f11cc889d9 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/set_table.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/set_table.yaml @@ -28,6 +28,7 @@ defaults: - end_effector_sensor - target_start_gps_compass_sensor - target_goal_gps_compass_sensor + - localization_sensor - _self_ type: RearrangeCompositeTask-v0 diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/tidy_house.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/tidy_house.yaml index f1452911a4..6d176c934a 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/tidy_house.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/tidy_house.yaml @@ -28,6 +28,7 @@ defaults: - end_effector_sensor - target_start_gps_compass_sensor - target_goal_gps_compass_sensor + - localization_sensor - _self_ type: RearrangeCompositeTask-v0 From bb2e4da257cecd936b3041e81342f520809bce70 Mon Sep 17 00:00:00 2001 From: Xavier Puig Date: Mon, 9 Jan 2023 09:58:36 -0800 Subject: [PATCH 05/48] Update README.md Fixed link for PDDL --- habitat-baselines/habitat_baselines/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/README.md b/habitat-baselines/habitat_baselines/README.md index e2e06f258d..a37992654c 100644 --- a/habitat-baselines/habitat_baselines/README.md +++ b/habitat-baselines/habitat_baselines/README.md @@ -55,7 +55,7 @@ Change the `/benchmark/nav/pointnav: pointnav_gibson` in `habitat_baselines/conf ### Hierarchical Reinforcement Learning (HRL) -We provide a two-layer hierarchical policy class, consisting of a low-level skill that moves the robot, and a high-level policy that reasons about which low-level skill to use in the current state. This can be especially powerful in long-horizon mobile manipulation tasks, like those introduced in [Habitat2.0](https://arxiv.org/abs/2106.14405). Both the low- and high- level can be either learned or an oracle. For oracle high-level we use [PDDL] (https://planning.wiki/guide/whatis/pddl), and for oracle low-level we use instantaneous transitions, with the environment set to the final desired state. Additionally, for navigation, we provide an oracle navigation skill that uses A-star and the map of the environment to move the robot to its goal. +We provide a two-layer hierarchical policy class, consisting of a low-level skill that moves the robot, and a high-level policy that reasons about which low-level skill to use in the current state. This can be especially powerful in long-horizon mobile manipulation tasks, like those introduced in [Habitat2.0](https://arxiv.org/abs/2106.14405). Both the low- and high- level can be either learned or an oracle. For oracle high-level we use [PDDL](https://planning.wiki/guide/whatis/pddl), and for oracle low-level we use instantaneous transitions, with the environment set to the final desired state. Additionally, for navigation, we provide an oracle navigation skill that uses A-star and the map of the environment to move the robot to its goal. To run the following examples, you need the [ReplicaCAD dataset](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md#replicacad). 
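[Editor's note] The hierarchical setup the README above describes reduces to a simple control loop: the high-level policy picks a low-level skill, the skill emits environment actions until it reports termination, and only then is the high-level policy queried again. The sketch below illustrates that loop in isolation. It is a minimal, self-contained example; the names `Skill`, `FixedHighLevel`, and `run_episode` are hypothetical and are not part of the habitat-baselines API.

```python
from typing import Dict, List


class Skill:
    """Toy low-level skill: runs for a fixed number of steps, then asks to stop."""

    def __init__(self, name: str, max_steps: int):
        self.name = name
        self.max_steps = max_steps
        self._step = 0

    def reset(self) -> None:
        self._step = 0

    def should_terminate(self) -> bool:
        return self._step >= self.max_steps

    def act(self, observation: Dict[str, int]) -> str:
        self._step += 1
        return f"{self.name}:step{self._step}"


class FixedHighLevel:
    """Toy fixed high-level policy: hands out skills from a predefined plan."""

    def __init__(self, plan: List[Skill]):
        self._plan = plan
        self._idx = -1

    def next_skill(self) -> Skill:
        self._idx = min(self._idx + 1, len(self._plan) - 1)
        skill = self._plan[self._idx]
        skill.reset()
        return skill


def run_episode(num_steps: int = 10) -> None:
    high_level = FixedHighLevel([Skill("nav", 3), Skill("pick", 2), Skill("place", 2)])
    skill = high_level.next_skill()
    for t in range(num_steps):
        observation = {"step": t}
        # The high-level policy is only consulted when the current skill terminates,
        # which is the same gating the hierarchical policy applies per environment.
        if skill.should_terminate():
            skill = high_level.next_skill()
        action = skill.act(observation)
        print(t, skill.name, action)


if __name__ == "__main__":
    run_episode()
```

A learned high-level policy would replace `FixedHighLevel.next_skill` with a network mapping the current observation to a skill index, which is what the `NeuralHighLevelPolicy` configs in this patch series set up.
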
From 628063d3ace15548aba8b21e94eb0a9ac143f7cb Mon Sep 17 00:00:00 2001 From: Xavier Puig Date: Mon, 9 Jan 2023 11:41:00 -0800 Subject: [PATCH 06/48] Match tensor device when checking if the skills is done --- habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 4e95a29221..3344fb7be9 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -195,6 +195,8 @@ def should_terminate( ) self._delay_term[env_i] = True is_skill_done[i] = 0.0 + + is_skill_done = is_skill_done.to(hl_says_term.device) is_skill_done |= hl_says_term if bad_terminate.sum() > 0: From d179ecf9b408f7519b9149bacdfe35193cc21597 Mon Sep 17 00:00:00 2001 From: Xavier Puig Date: Fri, 13 Jan 2023 09:03:30 -0800 Subject: [PATCH 07/48] Train hl modif2 (#1076) * make should_terminate consistently on cpu * Corrected config files for skills using noop + bug in pddl_apply_action * bug fixes in apply postcond * Update hierarchical_policy.py --- .../rl/policy/hierarchical_tp_noop_onav.yaml | 2 ++ .../rl/hrl/hierarchical_policy.py | 8 ++++-- .../rl/hrl/skills/oracle_nav.py | 2 +- .../habitat_baselines/rl/hrl/skills/skill.py | 25 +++++++++++-------- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml index d090a528c7..4540b9aa88 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml @@ -36,12 +36,14 @@ hierarchical_policy: obs_skill_inputs: ["obj_start_sensor"] max_skill_steps: 1 apply_postconds: True + force_end_on_timeout: False place: skill_name: "NoopSkillPolicy" obs_skill_inputs: ["obj_goal_sensor"] max_skill_steps: 1 apply_postconds: True + force_end_on_timeout: False nav_to_obj: skill_name: "OracleNavPolicy" diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index b8f63ad9eb..ad0226262e 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -212,7 +212,7 @@ def act( masks, self._cur_skills, log_info, - ) + ).cpu() # Compute the actions from the current skills actions = torch.zeros( (self._num_envs, get_num_actions(self._action_space)), @@ -239,9 +239,12 @@ def act( # Policy has not prediced a skill yet. should_terminate[batch_ids] = 1.0 continue + # TODO: either change name of the function or assign actions somewhere + # else. Updating actions in should_terminate is counterintuitive ( should_terminate[batch_ids], bad_should_terminate[batch_ids], + actions[batch_ids] ) = self._skills[skill_id].should_terminate( **dat, batch_idx=batch_ids, @@ -315,7 +318,8 @@ def act( ) # LL skills are not allowed to terminate the overall episode. 
- actions[batch_ids] = action_data.actions + actions[batch_ids] += action_data.actions + # Add actions from apply_postcond rnn_hidden_states[batch_ids] = action_data.rnn_hidden_states actions[:, self._stop_action_idx] = 0.0 diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index 374e17494c..5a27d3f527 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -116,7 +116,7 @@ def _is_skill_done( masks, batch_idx, ) -> torch.BoolTensor: - ret = torch.zeros(masks.shape[0], dtype=torch.bool).to(masks.device) + ret = torch.zeros(masks.shape[0], dtype=torch.bool) cur_pos = observations[LocalizationSensor.cls_uuid].cpu() diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 3344fb7be9..a0ca5854b5 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -38,7 +38,9 @@ def __init__( None for _ in range(self._batch_size) ] - if "pddl_apply_action" in action_space: + # TODO: for some reason this doesnt work with "pddl_apply_action" in action_space + # and needs to go through the keys argument + if "pddl_apply_action" in list(action_space.keys()): self._pddl_ac_start, _ = find_action_range( action_space, "pddl_apply_action" ) @@ -102,6 +104,9 @@ def _apply_postcond( env_i, idx, ): + """ + Modifies the actions according to the postconditions set in self._pddl_problem.actions[skill_name] + """ skill_args = self._raw_skill_args[env_i] action = self._pddl_problem.actions[skill_name] @@ -137,7 +142,7 @@ def _apply_postcond( apply_action.set_param_values(entities) log_info[env_i]["pddl_action"] = apply_action.compact_str - return actions + return actions[idx] def should_terminate( self, @@ -156,7 +161,7 @@ def should_terminate( """ is_skill_done = self._is_skill_done( observations, rnn_hidden_states, prev_actions, masks, batch_idx - ) + ).cpu() if is_skill_done.sum() > 0: self._internal_log( f"Requested skill termination {is_skill_done}", @@ -177,10 +182,10 @@ def should_terminate( if self._config.max_skill_steps > 0: over_max_len = cur_skill_step > self._config.max_skill_steps if self._config.force_end_on_timeout: - bad_terminate = over_max_len + bad_terminate = over_max_len.cpu() else: - is_skill_done = is_skill_done | over_max_len - + is_skill_done = is_skill_done | over_max_len.cpu() + new_actions = torch.zeros_like(actions) for i, env_i in enumerate(batch_idx): if self._delay_term[env_i]: self._delay_term[env_i] = False @@ -190,22 +195,20 @@ def should_terminate( and is_skill_done[i] == 1.0 and hl_says_term[i] == 0.0 ): - actions = self._apply_postcond( + new_actions[i] = self._apply_postcond( actions, log_info, skill_name[i], env_i, i ) self._delay_term[env_i] = True is_skill_done[i] = 0.0 - is_skill_done = is_skill_done.to(hl_says_term.device) - is_skill_done |= hl_says_term + is_skill_done |= hl_says_term.cpu() if bad_terminate.sum() > 0: self._internal_log( f"Bad terminating due to timeout {cur_skill_step}, {bad_terminate}", observations, ) - - return is_skill_done, bad_terminate + return is_skill_done.cpu(), bad_terminate.cpu(), new_actions def on_enter( self, From 8eddcd153858fc2cb9892e2355e978f9bd4e05b5 Mon Sep 17 00:00:00 2001 From: ASzot Date: Thu, 19 Jan 2023 21:59:24 -0800 Subject: [PATCH 08/48] Fixed RNN problem --- 
.../common/rollout_storage.py | 22 ++++- .../defined_skills/nn_skills.yaml | 43 ++++++++ .../defined_skills/noop_skills.yaml | 52 ++++++++++ .../rl/policy/hierarchical_srl_onav.yaml | 77 --------------- .../rl/policy/hierarchical_tp_noop_onav.yaml | 74 -------------- .../rl/policy/hierarchical_tp_srl.yaml | 71 ------------- .../habitat_baselines/rl/policy/hl_fixed.yaml | 21 ++++ .../rl/policy/hl_neural.yaml | 29 ++++++ .../config/rearrange/rl_hierarchical.yaml | 66 +++++++++++++ ...v.yaml => rl_hierarchical_oracle_nav.yaml} | 19 ++-- .../config/rearrange/rl_skill.yaml | 1 - .../config/rearrange/spap_pick.yaml | 39 -------- .../config/rearrange/spap_reach_state.yaml | 42 -------- .../config/rearrange/tp_srl.yaml | 42 -------- .../config/rearrange/tp_srl_oracle_nav.yaml | 61 ------------ .../rl/hrl/hierarchical_policy.py | 93 ++++++++++++------ .../rl/hrl/hl/high_level_policy.py | 27 +++++ .../rl/hrl/hl/neural_policy.py | 40 +++++++- .../habitat_baselines/rl/hrl/hrl_ppo.py | 14 +-- .../rl/hrl/hrl_rollout_storage.py | 41 ++++++-- .../rl/hrl/skills/art_obj.py | 2 +- .../rl/hrl/skills/nn_skill.py | 10 +- .../rl/hrl/skills/oracle_nav.py | 7 +- .../habitat_baselines/rl/hrl/skills/reset.py | 19 ++-- .../habitat_baselines/rl/hrl/skills/skill.py | 15 +-- .../habitat_baselines/rl/hrl/skills/wait.py | 4 +- .../habitat_baselines/rl/ppo/policy.py | 49 ++++++--- .../habitat_baselines/rl/ppo/ppo_trainer.py | 62 ++++++------ ...55.s-164-67-210-79.resnet.ucla.edu.58058.0 | Bin 0 -> 40 bytes .../benchmark/rearrange/rearrange_easy.yaml | 1 + test/test_baseline_training.py | 56 ++++++----- 31 files changed, 536 insertions(+), 563 deletions(-) create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/noop_skills.yaml delete mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml delete mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml delete mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml create mode 100644 habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml rename habitat-baselines/habitat_baselines/config/rearrange/{rl_hl_srl_onav.yaml => rl_hierarchical_oracle_nav.yaml} (81%) delete mode 100644 habitat-baselines/habitat_baselines/config/rearrange/spap_pick.yaml delete mode 100644 habitat-baselines/habitat_baselines/config/rearrange/spap_reach_state.yaml delete mode 100644 habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml delete mode 100644 habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml create mode 100644 habitat-baselines/tb/events.out.tfevents.1674193555.s-164-67-210-79.resnet.ucla.edu.58058.0 diff --git a/habitat-baselines/habitat_baselines/common/rollout_storage.py b/habitat-baselines/habitat_baselines/common/rollout_storage.py index 8bc0b3b9f2..50d11942c2 100644 --- a/habitat-baselines/habitat_baselines/common/rollout_storage.py +++ b/habitat-baselines/habitat_baselines/common/rollout_storage.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this 
source tree. import warnings -from typing import Any, Dict, Iterator, Optional, Tuple +from typing import Any, Dict, Iterator, Optional import numpy as np import torch @@ -16,6 +16,10 @@ build_pack_info_from_dones, build_rnn_build_seq_info, ) +from habitat_baselines.utils.common import ( + get_num_actions, + is_continuous_action_space, +) @baseline_registry.register_storage @@ -30,10 +34,22 @@ def __init__( action_space, recurrent_hidden_state_size, num_recurrent_layers=1, - action_shape: Optional[Tuple[int]] = None, is_double_buffered: bool = False, - discrete_actions: bool = True, ): + + if is_continuous_action_space(action_space): + # Assume ALL actions are NOT discrete + action_shape = ( + get_num_actions( + action_space, + ), + ) + discrete_actions = False + else: + # For discrete pointnav + action_shape = (1,) + discrete_actions = True + self.buffers = TensorDict() self.buffers["observations"] = TensorDict() diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml new file mode 100644 index 0000000000..24d7cbca38 --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml @@ -0,0 +1,43 @@ +open_cab: + skill_name: "ArtObjSkillPolicy" + load_ckpt_file: "data/models/open_cab.pth" + +open_fridge: + skill_name: "ArtObjSkillPolicy" + load_ckpt_file: "data/models/open_fridge.pth" + +close_cab: + skill_name: "ArtObjSkillPolicy" + load_ckpt_file: "data/models/close_cab.pth" + +close_fridge: + skill_name: "ArtObjSkillPolicy" + load_ckpt_file: "data/models/close_fridge.pth" + +pick: + skill_name: "PickSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + load_ckpt_file: "data/models/pick.pth" + +place: + skill_name: "PlaceSkillPolicy" + obs_skill_inputs: ["obj_goal_sensor"] + load_ckpt_file: "data/models/place.pth" + +wait_skill: + skill_name: "WaitSkillPolicy" + max_skill_steps: -1 + force_end_on_timeout: False + +nav_to_obj: + skill_name: "NavSkillPolicy" + obs_skill_inputs: ["goal_to_agent_gps_compass"] + load_ckpt_file: "data/models/nav.pth" + max_skill_steps: 300 + obs_skill_input_dim: 2 + +reset_arm_skill: + skill_name: "ResetArmSkill" + max_skill_steps: 50 + reset_joint_state: [-4.50e-01, -1.08e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] + force_end_on_timeout: False diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/noop_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/noop_skills.yaml new file mode 100644 index 0000000000..c8b1a34e29 --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/noop_skills.yaml @@ -0,0 +1,52 @@ +open_cab: + skill_name: "NoopSkillPolicy" + max_skill_steps: 1 + apply_postconds: True + +open_fridge: + skill_name: "NoopSkillPolicy" + max_skill_steps: 1 + apply_postconds: True + +close_cab: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + +close_fridge: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + apply_postconds: True + +pick: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_start_sensor"] + max_skill_steps: 1 + apply_postconds: True + force_end_on_timeout: False + 
+place: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["obj_goal_sensor"] + max_skill_steps: 1 + apply_postconds: True + force_end_on_timeout: False + +wait_skill: + skill_name: "WaitSkillPolicy" + max_skill_steps: -1 + +nav_to_obj: + skill_name: "NoopSkillPolicy" + obs_skill_inputs: ["goal_to_agent_gps_compass"] + max_skill_steps: 1 + apply_postconds: True + force_end_on_timeout: False + obs_skill_input_dim: 2 + +reset_arm_skill: + skill_name: "ResetArmSkill" + max_skill_steps: 50 + reset_joint_state: [-4.50e-01, -1.07e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] + force_end_on_timeout: False diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml deleted file mode 100644 index d92283dbc2..0000000000 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_srl_onav.yaml +++ /dev/null @@ -1,77 +0,0 @@ -name: "HierarchicalPolicy" -obs_transforms: - add_virtual_keys: - virtual_keys: - "goal_to_agent_gps_compass": 2 -hierarchical_policy: - high_level_policy: - name: "NeuralHighLevelPolicy" - allow_other_place: False - hidden_dim: 512 - use_rnn: True - rnn_type: 'LSTM' - backbone: resnet18 - normalize_visual_inputs: False - num_rnn_layers: 2 - policy_input_keys: - - "robot_head_depth" - defined_skills: - open_cab: - skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - load_ckpt_file: "data/models/open_cab.pth" - - open_fridge: - skill_name: "ArtObjSkillPolicy" - load_ckpt_file: "data/models/open_fridge.pth" - - close_cab: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_cab.pth" - - close_fridge: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_fridge.pth" - - pick: - skill_name: "PickSkillPolicy" - name: "PointNavResNetPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/pick.pth" - - place: - skill_name: "PlaceSkillPolicy" - name: "PointNavResNetPolicy" - obs_skill_inputs: ["obj_goal_sensor"] - load_ckpt_file: "data/models/place.pth" - - wait_skill: - skill_name: "WaitSkillPolicy" - max_skill_steps: -1 - force_end_on_timeout: False - - oracle_nav: - skill_name: "OracleNavPolicy" - obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] - max_skill_steps: 300 - - reset_arm_skill: - skill_name: "ResetArmSkill" - max_skill_steps: 50 - reset_joint_state: [-4.50e-01, -1.08e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] - force_end_on_timeout: False - - use_skills: - # Uncomment if you are also using these skills - # open_cab: "open_cab" - # open_fridge: "open_fridge" - # close_cab: "open_cab" - # close_fridge: "open_fridge" - pick: "pick" - place: "place" - nav: "oracle_nav" - nav_to_receptacle: "oracle_nav" - wait: "wait_skill" - reset_arm: "reset_arm_skill" \ No newline at end of file diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml deleted file mode 100644 index 4540b9aa88..0000000000 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_noop_onav.yaml +++ /dev/null @@ -1,74 +0,0 @@ - -name: "HierarchicalPolicy" -obs_transforms: - add_virtual_keys: - virtual_keys: - 
"goal_to_agent_gps_compass": 2 -hierarchical_policy: - high_level_policy: - name: "FixedHighLevelPolicy" - add_arm_rest: True - - defined_skills: - open_cab: - skill_name: "NoopSkillPolicy" - max_skill_steps: 1 - apply_postconds: True - - open_fridge: - skill_name: "NoopSkillPolicy" - max_skill_steps: 1 - apply_postconds: True - - close_cab: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - max_skill_steps: 1 - - close_fridge: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - max_skill_steps: 1 - apply_postconds: True - - pick: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - max_skill_steps: 1 - apply_postconds: True - force_end_on_timeout: False - - place: - skill_name: "NoopSkillPolicy" - obs_skill_inputs: ["obj_goal_sensor"] - max_skill_steps: 1 - apply_postconds: True - force_end_on_timeout: False - - nav_to_obj: - skill_name: "OracleNavPolicy" - obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] - max_skill_steps: 300 - - wait_skill: - skill_name: "WaitSkillPolicy" - max_skill_steps: -1 - force_end_on_timeout: False - - reset_arm_skill: - skill_name: "ResetArmSkill" - max_skill_steps: 50 - reset_joint_state: [-4.50e-01, -1.07e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] - force_end_on_timeout: False - - use_skills: - open_cab: "open_cab" - open_fridge: "open_fridge" - close_cab: "close_cab" - close_fridge: "close_fridge" - pick: "pick" - place: "place" - nav: "nav_to_obj" - nav_to_receptacle: "nav_to_obj" - wait: "wait_skill" - reset_arm: "reset_arm_skill" diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml deleted file mode 100644 index 3ac608e23b..0000000000 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_tp_srl.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "HierarchicalPolicy" -obs_transforms: - add_virtual_keys: - virtual_keys: - "goal_to_agent_gps_compass": 2 -hierarchical_policy: - high_level_policy: - name: "FixedHighLevelPolicy" - add_arm_rest: True - defined_skills: - open_cab: - skill_name: "ArtObjSkillPolicy" - name: "PointNavResNetPolicy" - load_ckpt_file: "data/models/open_cab.pth" - - open_fridge: - skill_name: "ArtObjSkillPolicy" - skill_name: "ArtObjSkillPolicy" - load_ckpt_file: "data/models/open_fridge.pth" - - close_cab: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_cab.pth" - - close_fridge: - skill_name: "ArtObjSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/close_fridge.pth" - - pick: - skill_name: "PickSkillPolicy" - obs_skill_inputs: ["obj_start_sensor"] - load_ckpt_file: "data/models/pick.pth" - - place: - skill_name: "PlaceSkillPolicy" - obs_skill_inputs: ["obj_goal_sensor"] - load_ckpt_file: "data/models/place.pth" - - wait_skill: - skill_name: "WaitSkillPolicy" - max_skill_steps: -1 - force_end_on_timeout: False - - nav_to_obj: - skill_name: "NavSkillPolicy" - obs_skill_inputs: ["goal_to_agent_gps_compass"] - obs_skill_input_dim: 2 - load_ckpt_file: "data/models/nav.pth" - max_skill_steps: 300 - force_end_on_timeout: False - - reset_arm_skill: - skill_name: "ResetArmSkill" - max_skill_steps: 50 - reset_joint_state: [-4.50e-01, -1.08e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] - force_end_on_timeout: False - - 
use_skills: - # Uncomment if you are also using these skills - # open_cab: "open_cab" - # open_fridge: "open_fridge" - # close_cab: "open_cab" - # close_fridge: "open_fridge" - pick: "pick" - place: "place" - nav: "nav_to_obj" - nav_to_receptacle: "nav_to_obj" - wait: "wait_skill" - reset_arm: "reset_arm_skill" \ No newline at end of file diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml new file mode 100644 index 0000000000..0aef449dbe --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml @@ -0,0 +1,21 @@ +name: "HierarchicalPolicy" +obs_transforms: + add_virtual_keys: + virtual_keys: + "goal_to_agent_gps_compass": 2 +hierarchical_policy: + high_level_policy: + name: "FixedHighLevelPolicy" + add_arm_rest: True + use_skills: + open_cab: "open_cab" + open_fridge: "open_fridge" + close_cab: "close_cab" + close_fridge: "close_fridge" + pick: "pick" + place: "place" + nav: "nav_to_obj" + nav_to_receptacle: "nav_to_obj" + wait: "wait_skill" + reset_arm: "reset_arm_skill" + defined_skills: {} diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml new file mode 100644 index 0000000000..f15a1eb389 --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml @@ -0,0 +1,29 @@ +name: "HierarchicalPolicy" +obs_transforms: + add_virtual_keys: + virtual_keys: + "goal_to_agent_gps_compass": 2 +hierarchical_policy: + high_level_policy: + name: "NeuralHighLevelPolicy" + allow_other_place: False + hidden_dim: 512 + use_rnn: True + rnn_type: 'LSTM' + backbone: resnet18 + normalize_visual_inputs: False + num_rnn_layers: 2 + policy_input_keys: + - "robot_head_depth" + use_skills: + open_cab: "open_cab" + open_fridge: "open_fridge" + close_cab: "open_cab" + close_fridge: "open_fridge" + pick: "pick" + place: "place" + nav: "nav_to_obj" + nav_to_receptacle: "nav_to_obj" + wait: "wait_skill" + reset_arm: "reset_arm_skill" + defined_skills: {} diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml new file mode 100644 index 0000000000..df1f0ee015 --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -0,0 +1,66 @@ +# @package _global_ + +defaults: + - /benchmark/rearrange: rearrange_easy + - /habitat_baselines: habitat_baselines_rl_config_base + - /habitat_baselines/rl/policy/obs_transforms: + - add_virtual_keys_base + - /habitat_baselines/rl/policy: hl_fixed + - /habitat_baselines/rl/policy/hierarchical_policy/defined_skills: nn_skills + - _self_ + +habitat_baselines: + verbose: False + trainer_name: "ddppo" + torch_gpu_id: 0 + video_fps: 30 + eval_ckpt_path_dir: "" + num_environments: 3 + writer_type: 'tb' + num_updates: -1 + total_num_steps: 1.0e8 + log_interval: 10 + num_checkpoints: 20 + force_torch_single_threaded: True + eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] + load_resume_state_config: False + + eval: + use_ckpt_config: False + should_load_ckpt: False + video_option: ["disk"] + + rl: + ppo: + # ppo params + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.0001 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + num_steps: 128 + 
use_gae: True + gamma: 0.99 + tau: 0.95 + + ddppo: + sync_frac: 0.6 + # The PyTorch distributed backend to use + distrib_backend: NCCL + # Visual encoder backbone + pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth + # Initialize with pretrained weights + pretrained: False + # Initialize just the visual encoder backbone with pretrained weights + pretrained_encoder: False + # Whether the visual encoder backbone will be trained. + train_encoder: True + # Whether to reset the critic linear layer + reset_critic: False + # Model parameters + backbone: resnet18 + rnn_type: LSTM + num_recurrent_layers: 2 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml similarity index 81% rename from habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml rename to habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml index 5ad654137d..3152f00641 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml @@ -1,14 +1,12 @@ # @package _global_ -# Pick and place are instantaneous skills, -# where environment is set to final state. -# Navigation is kinematically simulated. defaults: - /benchmark/rearrange: rearrange_easy - /habitat_baselines: habitat_baselines_rl_config_base - /habitat_baselines/rl/policy/obs_transforms: - add_virtual_keys_base - - /habitat_baselines/rl/policy: hierarchical_srl_onav + - /habitat_baselines/rl/policy: hl_neural + - /habitat_baselines/rl/policy/hierarchical_policy/defined_skills: noop_skills - /habitat/task/actions: - pddl_apply_action - oracle_nav_action @@ -28,6 +26,7 @@ habitat: - obj_goal_gps_compass - joint - is_holding + - ee_pos - localization_sensor @@ -39,8 +38,6 @@ habitat_baselines: updater_name: "HrlPPO" distrib_updater_name: "HrlDDPPO" video_fps: 30 - video_render_views: - - "third_rgb_sensor" eval_ckpt_path_dir: "" num_environments: 3 writer_type: 'tb' @@ -59,6 +56,16 @@ habitat_baselines: video_option: ["disk"] rl: + policy: + hierarchical_policy: + use_skills: + nav: "oracle_nav" + nav_to_receptacle: "oracle_nav" + defined_skills: + oracle_nav: + skill_name: "OracleNavPolicy" + obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] + max_skill_steps: 300 ppo: # ppo params clip_param: 0.2 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml index e8908a1062..845561c403 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml @@ -18,7 +18,6 @@ habitat_baselines: eval_ckpt_path_dir: "data/new_checkpoints" # 26 environments will just barely be below 16gb. 
num_environments: 26 - checkpoint_folder: "data/new_checkpoints" num_updates: -1 total_num_steps: 1.0e8 log_interval: 10 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/spap_pick.yaml b/habitat-baselines/habitat_baselines/config/rearrange/spap_pick.yaml deleted file mode 100644 index 5ba89283e5..0000000000 --- a/habitat-baselines/habitat_baselines/config/rearrange/spap_pick.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# @package _global_ - -defaults: - - /benchmark/rearrange: pick - - /habitat_baselines: habitat_baselines_spa_config_base - - _self_ - -habitat_baselines: - video_dir: "data/vids/" - - sense_plan_act: - verbose: True - run_freq: 2 - n_grasps: 100 - mp_obj: True - mp_margin: null - mp_render: True - timeout: 3 - exec_ee_thresh: 0.1 - # "Priv" or "Reg" - mp_sim_type: "Priv" - video_dir: 'data/vids' - debug_dir: "data/mp_test" - count_obj_collisions: True - grasp_gen_is_verbose: True - ik_dist_thresh: 0.1 - - eval: - video_option: ["disk"] - -habitat: - gym: - obs_keys: ['joint', 'ee_pos'] - desired_goal_keys: ['obj_goal_pos_sensor'] - achieved_goal_keys: [] - action_keys: ['arm_action'] - task: - success_reward: 2000.0 - end_on_success: False diff --git a/habitat-baselines/habitat_baselines/config/rearrange/spap_reach_state.yaml b/habitat-baselines/habitat_baselines/config/rearrange/spap_reach_state.yaml deleted file mode 100644 index d62f630093..0000000000 --- a/habitat-baselines/habitat_baselines/config/rearrange/spap_reach_state.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# @package _global_ - -defaults: - - /benchmark/rearrange: reach_state - - /habitat_baselines: habitat_baselines_spa_config_base - - _self_ - -habitat_baselines: - video_dir: "data/vids/" - - sense_plan_act: - verbose: True - run_freq: 4 - n_grasps: 100 - mp_obj: True - mp_margin: null - mp_render: True - timeout: 3 - exec_ee_thresh: 0.1 - # "Priv" or "Reg" - mp_sim_type: "Priv" - video_dir: 'data/vids' - debug_dir: "data/mp_test" - count_obj_collisions: True - grasp_gen_is_verbose: True - ik_dist_thresh: 0.1 - ik_speed_factor: 1.0 - - eval: - video_option: ["disk"] - -habitat: - gym: - obs_keys: [ 'joint', 'ee_pos' ] - desired_goal_keys: [ 'resting_position' ] - action_keys: [ 'arm_action' ] - task: - reward_measure: "rearrange_reach_reward" - success_measure: "rearrange_reach_success" - success_reward: 10.0 - slack_reward: -0.01 - end_on_success: False diff --git a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml deleted file mode 100644 index 86394ad477..0000000000 --- a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# @package _global_ - -defaults: - - /benchmark/rearrange: rearrange_easy - - /habitat_baselines: habitat_baselines_rl_config_base - - /habitat_baselines/rl/policy/obs_transforms: - - add_virtual_keys_base - - /habitat_baselines/rl/policy: hierarchical_srl_learned_nav - - /habitat/simulator/sim_sensors@habitat_baselines.eval.extra_sim_sensors.third_rgb_sensor: third_rgb_sensor - - _self_ - - -habitat_baselines: - verbose: False - trainer_name: "ddppo" - torch_gpu_id: 0 - tensorboard_dir: "tb" - video_dir: "video_dir" - video_fps: 30 - test_episode_count: -1 - eval_ckpt_path_dir: "" - num_environments: 1 - writer_type: 'tb' - checkpoint_folder: "data/new_checkpoints" - num_updates: -1 - total_num_steps: 1.0e8 - log_interval: 10 - num_checkpoints: 20 - force_torch_single_threaded: True - eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] - 
load_resume_state_config: False - eval: - use_ckpt_config: False - should_load_ckpt: False - video_option: ["disk"] - - rl: - ddppo: - pretrained: False - pretrained_encoder: False - train_encoder: True - reset_critic: False diff --git a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml deleted file mode 100644 index 55e684eec3..0000000000 --- a/habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# @package _global_ - -defaults: - - /benchmark/rearrange: rearrange_easy - - /habitat_baselines: habitat_baselines_rl_config_base - - /habitat_baselines/rl/policy/obs_transforms: - - add_virtual_keys_base - - /habitat_baselines/rl/policy: hierarchical_srl_oracle_nav - - /habitat/task/actions: - - oracle_nav_action - - arm_action - - base_velocity - - rearrange_stop - - /habitat/simulator/sim_sensors@habitat_baselines.eval.extra_sim_sensors.third_rgb_sensor: third_rgb_sensor - - _self_ - -habitat: - gym: - auto_name: RearrangeEasy - obs_keys: - - robot_head_depth - - relative_resting_position - - obj_start_sensor - - obj_goal_sensor - - obj_start_gps_compass - - obj_goal_gps_compass - - joint - - is_holding - - localization_sensor - - -habitat_baselines: - verbose: False - trainer_name: "ddppo" - torch_gpu_id: 0 - tensorboard_dir: "tb" - video_dir: "video_dir" - video_fps: 30 - test_episode_count: -1 - eval_ckpt_path_dir: "" - num_environments: 1 - writer_type: 'tb' - checkpoint_folder: "data/new_checkpoints" - num_updates: -1 - total_num_steps: 1.0e8 - log_interval: 10 - num_checkpoints: 20 - force_torch_single_threaded: True - eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] - load_resume_state_config: False - eval: - use_ckpt_config: False - should_load_ckpt: False - video_option: ["disk"] - - rl: - ddppo: - pretrained: False - pretrained_encoder: False - train_encoder: True - reset_critic: False diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index ad0226262e..ab8f9bd420 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -93,9 +93,6 @@ def __init__( self._name_to_idx[skill_id] = i self._idx_to_name[i] = skill_id - self._call_high_level: torch.Tensor = torch.ones( - self._num_envs, dtype=torch.bool - ) self._cur_skills: torch.Tensor = torch.full( (self._num_envs,), -1, dtype=torch.long ) @@ -118,6 +115,13 @@ def __init__( def eval(self): pass + def get_policy_action_space( + self, env_action_space: spaces.Space + ) -> spaces.Space: + return self._high_level_policy.get_policy_action_space( + env_action_space + ) + def extract_policy_info( self, action_data, infos, dones ) -> List[Dict[str, float]]: @@ -142,7 +146,10 @@ def extract_policy_info( @property def num_recurrent_layers(self): - return self._high_level_policy.num_recurrent_layers + if self._high_level_policy.num_recurrent_layers != 0: + return self._high_level_policy.num_recurrent_layers + else: + return self._skills[0].num_recurrent_layers @property def should_load_agent_state(self): @@ -194,11 +201,11 @@ def act( masks, deterministic=False, ): - + masks_cpu = masks.cpu() log_info: List[Dict[str, Any]] = [{} for _ in range(self._num_envs)] - self._high_level_policy.apply_mask(masks) # type: ignore[attr-defined] + self._high_level_policy.apply_mask(masks_cpu) # 
type: ignore[attr-defined] - should_terminate: torch.BoolTensor = torch.zeros( + call_high_level: torch.BoolTensor = torch.zeros( (self._num_envs,), dtype=torch.bool ) bad_should_terminate: torch.BoolTensor = torch.zeros( @@ -212,7 +219,7 @@ def act( masks, self._cur_skills, log_info, - ).cpu() + ) # Compute the actions from the current skills actions = torch.zeros( (self._num_envs, get_num_actions(self._action_space)), @@ -237,14 +244,14 @@ def act( for skill_id, (batch_ids, dat) in grouped_skills.items(): if skill_id == -1: # Policy has not prediced a skill yet. - should_terminate[batch_ids] = 1.0 + call_high_level[batch_ids] = 1.0 continue # TODO: either change name of the function or assign actions somewhere - # else. Updating actions in should_terminate is counterintuitive + # else. Updating actions in should_terminate is counterintuitive ( - should_terminate[batch_ids], + call_high_level[batch_ids], bad_should_terminate[batch_ids], - actions[batch_ids] + actions[batch_ids], ) = self._skills[skill_id].should_terminate( **dat, batch_idx=batch_ids, @@ -254,16 +261,14 @@ def act( for i in batch_ids ], ) - self._call_high_level = should_terminate - # Always call high-level if the episode is over. - self._call_high_level = self._call_high_level | (~masks).view(-1).cpu() + call_high_level = call_high_level | (~masks_cpu).view(-1) # If any skills want to terminate invoke the high-level policy to get # the next skill. hl_terminate = torch.zeros(self._num_envs, dtype=torch.bool) hl_info: Dict[str, Any] = {} - if self._call_high_level.sum() > 0: + if call_high_level.sum() > 0: ( new_skills, new_skill_args, @@ -274,7 +279,7 @@ def act( rnn_hidden_states, prev_actions, masks, - self._call_high_level, + call_high_level, deterministic, log_info, ) @@ -282,7 +287,7 @@ def act( sel_grouped_skills = self._broadcast_skill_ids( new_skills, sel_dat={}, - should_adds=self._call_high_level, + should_adds=call_high_level, ) for skill_id, (batch_ids, _) in sel_grouped_skills.items(): @@ -293,11 +298,16 @@ def act( rnn_hidden_states, prev_actions, ) - rnn_hidden_states[batch_ids] *= 0.0 - prev_actions[batch_ids] *= 0 - self._cur_skills = ( - (~self._call_high_level) * self._cur_skills - ) + (self._call_high_level * new_skills) + if "rnn_hidden_states" not in hl_info: + rnn_hidden_states[batch_ids] *= 0.0 + prev_actions[batch_ids] *= 0 + elif self._skills[skill_id].num_recurrent_layers != 0: + raise ValueError( + f"The code does not currently support neural LL and neural HL skills. 
Skill={self._skills[skill_id]}, HL={self._high_level_policy}" + ) + self._cur_skills = ((~call_high_level) * self._cur_skills) + ( + call_high_level * new_skills + ) grouped_skills = self._broadcast_skill_ids( self._cur_skills, @@ -332,16 +342,43 @@ def act( ) actions[batch_idx, self._stop_action_idx] = 1.0 - ac_info = { - "values": None, - "action_log_probs": None, + action_kwargs = { "rnn_hidden_states": rnn_hidden_states, "actions": actions, } - ac_info.update(hl_info) + action_kwargs.update(hl_info) return PolicyAction( - take_actions=actions, policy_info=log_info, **ac_info + take_actions=actions, + policy_info=log_info, + should_inserts=call_high_level, + **action_kwargs, + ) + + def get_value(self, observations, rnn_hidden_states, prev_actions, masks): + return self._high_level_policy.get_value( + observations, rnn_hidden_states, prev_actions, masks + ) + + def _get_policy_components(self) -> Tuple[nn.Module]: + return self._high_level_policy.get_policy_components() + + def evaluate_actions( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + action, + rnn_build_seq_info: Dict[str, torch.Tensor], + ): + return self._high_level_policy.evaluate_actions( + observations, + rnn_hidden_states, + prev_actions, + masks, + action, + rnn_build_seq_info, ) @classmethod diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index 5bbc2726ec..6d1eb7bf50 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -8,6 +8,10 @@ class HighLevelPolicy(nn.Module): + """ + High level policy that selects from low-level skills. + """ + def __init__( self, config, @@ -27,6 +31,21 @@ def __init__( def to(self, device): self._device = device + return super().to(device) + + def get_value(self, observations, rnn_hidden_states, prev_actions, masks): + raise NotImplementedError() + + def evaluate_actions( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + action, + rnn_build_seq_info, + ): + raise NotImplementedError() @property def num_recurrent_layers(self): @@ -35,6 +54,11 @@ def num_recurrent_layers(self): def parameters(self): return iter([nn.Parameter(torch.zeros((1,), device=self._device))]) + def get_policy_action_space( + self, env_action_space: spaces.Space + ) -> spaces.Space: + return env_action_space + def get_next_skill( self, observations, @@ -69,6 +93,9 @@ def get_next_skill( def apply_mask(self, mask: torch.Tensor) -> None: pass + def get_policy_components(self) -> Tuple[nn.Module]: + return tuple() + def get_termination( self, observations, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py index c855e4fcc5..f8796f08a3 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py @@ -1,5 +1,5 @@ from itertools import chain -from typing import Any, List +from typing import Any, List, Tuple import gym.spaces as spaces import numpy as np @@ -92,6 +92,11 @@ def _setup_actions(self) -> List[PddlAction]: ] return all_actions + def get_policy_action_space( + self, env_action_space: spaces.Space + ) -> spaces.Space: + return spaces.Discrete(self._n_actions) + @property def num_recurrent_layers(self): return self._state_encoder.num_recurrent_layers @@ -105,7 +110,10 @@ def parameters(self): 
self._critic.parameters(), ) - def forward(self, obs, rnn_hidden_states, masks): + def get_policy_components(self) -> Tuple[nn.Module]: + return (self,) + + def forward(self, obs, rnn_hidden_states, masks, rnn_build_seq_info=None): hidden = [] if len(self._im_obs_space) > 0: im_obs = {k: obs[k] for k in self._im_obs_space.keys()} @@ -117,7 +125,9 @@ def forward(self, obs, rnn_hidden_states, masks): hidden.extend([obs[k] for k in self._state_obs_space.keys()]) hidden = torch.cat(hidden, -1) - return self._state_encoder(hidden, rnn_hidden_states, masks) + return self._state_encoder( + hidden, rnn_hidden_states, masks, rnn_build_seq_info + ) def to(self, device): self._device = device @@ -127,6 +137,30 @@ def get_value(self, observations, rnn_hidden_states, prev_actions, masks): state, _ = self.forward(observations, rnn_hidden_states, masks) return self._critic(state) + def evaluate_actions( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + action, + rnn_build_seq_info, + ): + features, _ = self.forward( + observations, rnn_hidden_states, masks, rnn_build_seq_info + ) + distribution = self._policy(features) + value = self._critic(features) + action_log_probs = distribution.log_probs(action) + distribution_entropy = distribution.entropy() + + return ( + value, + action_log_probs, + distribution_entropy, + rnn_hidden_states, + ) + def get_next_skill( self, observations, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py index 706f2b9a57..a0cb2222a7 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py @@ -28,13 +28,7 @@ def reduce_loss(loss): self._set_grads_to_none() - ( - values, - action_log_probs, - dist_entropy, - _, - aux_loss_res, - ) = self._evaluate_actions( + (values, action_log_probs, dist_entropy, _,) = self._evaluate_actions( batch["observations"], batch["recurrent_hidden_states"], batch["prev_actions"], @@ -87,8 +81,6 @@ def reduce_loss(loss): else: all_losses.append(self.entropy_coef.lagrangian_loss(dist_entropy)) - all_losses.extend(v["loss"] for v in aux_loss_res.values()) - total_loss = torch.stack(all_losses).sum() total_loss = self.before_backward(total_loss) @@ -118,10 +110,6 @@ def reduce_loss(loss): self.entropy_coef().detach() ) - for name, res in aux_loss_res.items(): - for k, v in res.items(): - learner_metrics[f"aux_{name}_{k}"].append(v.detach()) - @baseline_registry.register_updater class HrlDDPPO(DecentralizedDistributedMixin, HrlPPO): diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index 7d728afdfa..6d8491e592 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -11,6 +11,10 @@ from habitat_baselines.common.baseline_registry import baseline_registry from habitat_baselines.common.rollout_storage import RolloutStorage from habitat_baselines.common.tensor_dict import TensorDict +from habitat_baselines.rl.models.rnn_state_encoder import ( + build_pack_info_from_dones, + build_rnn_build_seq_info, +) EPS_PPO = 1e-5 @@ -20,7 +24,6 @@ class HrlRolloutStorage(RolloutStorage): def __init__(self, numsteps, num_envs, *args, **kwargs): super().__init__(numsteps, num_envs, *args, **kwargs) self._num_envs = num_envs - self._numsteps = numsteps self._cur_step_idxs = torch.zeros(self._num_envs, dtype=torch.long) 
self._last_should_inserts = None if self.is_double_buffered: @@ -40,6 +43,10 @@ def insert( buffer_index: int = 0, should_inserts=None, ): + if next_masks is not None: + next_masks = next_masks.to(self.device) + if rewards is not None: + rewards = rewards.to(self.device) next_step = dict( observations=next_observations, recurrent_hidden_states=next_recurrent_hidden_states, @@ -69,9 +76,7 @@ def insert( reward_write_idxs = torch.maximum( self._cur_step_idxs - 1, torch.zeros_like(self._cur_step_idxs) ) - self.buffers["rewards"][reward_write_idxs, env_idxs] += rewards.to( - self.device - ) + self.buffers["rewards"][reward_write_idxs, env_idxs] += rewards if len(next_step) > 0: self.buffers.set( @@ -97,9 +102,9 @@ def insert( def advance_rollout(self, buffer_index: int = 0): self._cur_step_idxs += self._last_should_inserts.long() - is_past_buffer = self._cur_step_idxs >= self._numsteps + is_past_buffer = self._cur_step_idxs >= self.num_steps if is_past_buffer.sum() > 0: - self._cur_step_idxs[is_past_buffer] = self._numsteps - 1 + self._cur_step_idxs[is_past_buffer] = self.num_steps - 1 env_idxs = torch.arange(self._num_envs) self.buffers["rewards"][ self._cur_step_idxs[is_past_buffer], env_idxs[is_past_buffer] @@ -139,14 +144,20 @@ def recurrent_generator( self, advantages, num_batches ) -> Iterator[TensorDict]: num_environments = advantages.size(1) + dones_cpu = ( + torch.logical_not(self.buffers["masks"]) + .cpu() + .view(-1, self._num_envs) + .numpy() + ) for inds in torch.randperm(num_environments).chunk(num_batches): - batch = self.buffers[0 : self._numsteps, inds] - batch["advantages"] = advantages[: self._numsteps, inds] + batch = self.buffers[0 : self.num_steps, inds] + batch["advantages"] = advantages[: self.num_steps, inds] batch["recurrent_hidden_states"] = batch[ "recurrent_hidden_states" ][0:1] batch["loss_mask"] = ( - torch.arange(self._numsteps, device=advantages.device) + torch.arange(self.num_steps, device=advantages.device) .view(-1, 1, 1) .repeat(1, len(inds), 1) ) @@ -156,4 +167,14 @@ def recurrent_generator( batch["loss_mask"][:, i] < self._cur_step_idxs[env_i] - 1 ) - yield batch.map(lambda v: v.flatten(0, 1)) + batch.map_in_place(lambda v: v.flatten(0, 1)) + batch["rnn_build_seq_info"] = build_rnn_build_seq_info( + device=self.device, + build_fn_result=build_pack_info_from_dones( + dones_cpu[0 : self.num_steps, inds.numpy()].reshape( + -1, len(inds) + ), + ), + ) + + yield batch.to_tree() diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/art_obj.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/art_obj.py index af083ef84a..b3211760c8 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/art_obj.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/art_obj.py @@ -66,4 +66,4 @@ def _is_skill_done( def _parse_skill_arg(self, skill_arg): self._internal_log(f"Parsing skill argument {skill_arg}") - return int(skill_arg[-1].split("|")[1]) + return int(skill_arg[1].split("|")[1]) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py index ed153443f6..3478613fb1 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py @@ -134,14 +134,16 @@ def _internal_act( masks, deterministic, ) - full_action = torch.zeros(prev_actions.shape, device=masks.device) + full_action = torch.zeros( + (masks.shape[0], self._full_ac_size), device=masks.device + ) full_action[ :, self._ac_start : 
self._ac_start + self._ac_len ] = action_data.actions action_data.actions = full_action self._did_want_done[cur_batch_idx] = full_action[ - cur_batch_idx, self._stop_action_idx + :, self._stop_action_idx ] return action_data @@ -176,6 +178,10 @@ def from_config( ) for k in config.obs_skill_inputs: + if k not in filtered_obs_space.spaces: + raise ValueError( + f"Could not find {k} for skill {policy_cfg.habitat.gym.auto_name}" + ) space = filtered_obs_space.spaces[k] # There is always a 3D position filtered_obs_space.spaces[k] = truncate_obs_space(space, 3) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index 5a27d3f527..2208084edc 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -169,6 +169,9 @@ def _parse_skill_arg(self, skill_arg): def _get_multi_sensor_index(self, batch_idx): return [self._cur_skill_args[i].target_idx for i in batch_idx] + def requires_rnn_state(self): + return False + def _internal_act( self, observations, @@ -178,7 +181,9 @@ def _internal_act( cur_batch_idx, deterministic=False, ): - full_action = torch.zeros(prev_actions.shape, device=masks.device) + full_action = torch.zeros( + (masks.shape[0], self._full_ac_size), device=masks.device + ) action_idxs = torch.FloatTensor( [self._cur_skill_args[i].action_idx + 1 for i in cur_batch_idx] ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py index 7d17aead32..5c1ab5e23a 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py @@ -9,8 +9,8 @@ import torch from habitat_baselines.rl.hrl.skills.skill import SkillPolicy +from habitat_baselines.rl.hrl.utils import find_action_range from habitat_baselines.rl.ppo.policy import PolicyAction -from habitat_baselines.utils.common import get_num_actions class ResetArmSkill(SkillPolicy): @@ -23,12 +23,7 @@ def __init__( super().__init__(config, action_space, batch_size, True) self._target = np.array([float(x) for x in config.reset_joint_state]) - self._ac_start = 0 - for k, space in action_space.items(): - if k != "arm_action": - self._ac_start += get_num_actions(space) - else: - break + self._arm_ac_range = find_action_range(action_space, "arm_action") def on_enter( self, @@ -85,14 +80,16 @@ def _internal_act( # always in [-1,1] and has the benefit of reducing the delta # amount as we converge to the target. 
delta = delta / np.maximum( - self._initial_delta.max(-1, keepdims=True), 1e-5 + self._initial_delta[cur_batch_idx].max(-1, keepdims=True), 1e-5 ) action = torch.zeros_like(prev_actions) - action[..., self._ac_start : self._ac_start + 7] = torch.from_numpy( - delta - ).to(device=action.device, dtype=action.dtype) + action[ + ..., self._arm_ac_range[0] : self._arm_ac_range[1] + ] = torch.from_numpy(delta).to( + device=action.device, dtype=action.dtype + ) return PolicyAction( actions=action, rnn_hidden_states=rnn_hidden_states diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index a0ca5854b5..fff04bf3d9 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -37,6 +37,7 @@ def __init__( self._raw_skill_args: List[Optional[str]] = [ None for _ in range(self._batch_size) ] + self._full_ac_size = get_num_actions(action_space) # TODO: for some reason this doesnt work with "pddl_apply_action" in action_space # and needs to go through the keys argument @@ -155,13 +156,13 @@ def should_terminate( batch_idx: List[int], skill_name: List[str], log_info, - ) -> Tuple[torch.BoolTensor, torch.BoolTensor]: + ) -> Tuple[torch.BoolTensor, torch.BoolTensor, torch.Tensor]: """ :returns: A (batch_size,) size tensor where 1 indicates the skill wants to end and 0 if not. """ is_skill_done = self._is_skill_done( observations, rnn_hidden_states, prev_actions, masks, batch_idx - ).cpu() + ) if is_skill_done.sum() > 0: self._internal_log( f"Requested skill termination {is_skill_done}", @@ -182,9 +183,9 @@ def should_terminate( if self._config.max_skill_steps > 0: over_max_len = cur_skill_step > self._config.max_skill_steps if self._config.force_end_on_timeout: - bad_terminate = over_max_len.cpu() + bad_terminate = over_max_len else: - is_skill_done = is_skill_done | over_max_len.cpu() + is_skill_done = is_skill_done | over_max_len new_actions = torch.zeros_like(actions) for i, env_i in enumerate(batch_idx): if self._delay_term[env_i]: @@ -200,15 +201,15 @@ def should_terminate( ) self._delay_term[env_i] = True is_skill_done[i] = 0.0 - - is_skill_done |= hl_says_term.cpu() + + is_skill_done |= hl_says_term if bad_terminate.sum() > 0: self._internal_log( f"Bad terminating due to timeout {cur_skill_step}, {bad_terminate}", observations, ) - return is_skill_done.cpu(), bad_terminate.cpu(), new_actions + return is_skill_done, bad_terminate, new_actions def on_enter( self, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py index e8a7ca03e1..caab4194ee 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py @@ -40,7 +40,9 @@ def _internal_act( cur_batch_idx, deterministic=False, ): - action = torch.zeros(prev_actions.shape, device=prev_actions.device) + action = torch.zeros( + (masks.shape[0], self._full_ac_size), device=prev_actions.device + ) return PolicyAction( actions=action, rnn_hidden_states=rnn_hidden_states ) diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index 22b7d95205..07da61345f 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -5,7 +5,16 @@ # LICENSE file in the root directory of this source tree. 
import abc from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, +) import torch from gym import spaces @@ -33,8 +42,12 @@ @dataclass class PolicyAction: + """ + Information returned from the `Policy` class. + """ + rnn_hidden_states: torch.Tensor - actions: torch.Tensor + actions: Optional[torch.Tensor] = None values: Optional[torch.Tensor] = None action_log_probs: Optional[torch.Tensor] = None take_actions: Optional[torch.Tensor] = None @@ -69,6 +82,26 @@ def num_recurrent_layers(self) -> int: def forward(self, *x): raise NotImplementedError + def get_policy_action_space( + self, env_action_space: spaces.Space + ) -> spaces.Space: + return env_action_space + + def _get_policy_components(self) -> Tuple[nn.Module]: + return tuple() + + def aux_loss_parameters(self) -> Dict[str, Iterable[torch.Tensor]]: + return {} + + def policy_parameters(self) -> Iterable[torch.Tensor]: + for c in self._get_policy_components(): + yield from c.parameters() + + def all_policy_tensors(self) -> Iterable[torch.Tensor]: + yield from self.policy_parameters() + for c in self._get_policy_components(): + yield from c.buffers() + def extract_policy_info( self, action_data: PolicyAction, infos, dones ) -> List[Dict[str, float]]: @@ -235,19 +268,9 @@ def evaluate_actions( aux_loss_res, ) - @property - def policy_components(self): + def _get_policy_components(self) -> Tuple[nn.Module]: return (self.net, self.critic, self.action_distribution) - def policy_parameters(self) -> Iterable[torch.Tensor]: - for c in self.policy_components: - yield from c.parameters() - - def all_policy_tensors(self) -> Iterable[torch.Tensor]: - yield from self.policy_parameters() - for c in self.policy_components: - yield from c.buffers() - def aux_loss_parameters(self) -> Dict[str, Iterable[torch.Tensor]]: return {k: v.parameters() for k, v in self.aux_loss_modules.items()} diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index 2559085cba..1adbc11c38 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -154,8 +154,8 @@ def _setup_actor_critic_agent(self, ppo_cfg: "DictConfig") -> None: self.actor_critic = policy.from_config( self.config, observation_space, - self.policy_action_space, - orig_action_space=self.orig_policy_action_space, + self.env_action_space, + orig_action_space=self.orig_env_action_space, ) self.obs_space = observation_space self.actor_critic.to(self.device) @@ -205,6 +205,9 @@ def _setup_actor_critic_agent(self, ppo_cfg: "DictConfig") -> None: ) self.agent = agent_cls.from_config(self.actor_critic, ppo_cfg) + self.policy_action_space = self.actor_critic.get_policy_action_space( + self.envs.action_spaces[0] + ) def _init_envs(self, config=None, is_eval: bool = False): if config is None: @@ -215,6 +218,8 @@ def _init_envs(self, config=None, is_eval: bool = False): workers_ignore_signals=is_slurm_batch_job(), enforce_scenes_greater_eq_environments=is_eval, ) + self.env_action_space = self.envs.action_spaces[0] + self.orig_env_action_space = self.envs.orig_action_spaces[0] def _init_train(self, resume_state=None): if resume_state is None: @@ -286,18 +291,6 @@ def _init_train(self, resume_state=None): self._init_envs() - action_space = self.envs.action_spaces[0] - self.policy_action_space = action_space - 
self.orig_policy_action_space = self.envs.orig_action_spaces[0] - if is_continuous_action_space(action_space): - # Assume ALL actions are NOT discrete - action_shape = (get_num_actions(action_space),) - discrete_actions = False - else: - # For discrete pointnav - action_shape = (1,) - discrete_actions = True - ppo_cfg = self.config.habitat_baselines.rl.ppo if torch.cuda.is_available(): self.device = torch.device( @@ -353,8 +346,6 @@ def _init_train(self, resume_state=None): ppo_cfg.hidden_size, num_recurrent_layers=self.actor_critic.num_recurrent_layers, is_double_buffered=ppo_cfg.use_double_buffered_sampler, - action_shape=action_shape, - discrete_actions=discrete_actions, ) self.rollouts.to(self.device) @@ -463,12 +454,12 @@ def _compute_actions_and_step_envs(self, buffer_index: int = 0): range(env_slice.start, env_slice.stop), action_data.env_actions.cpu().unbind(0), ): - if is_continuous_action_space(self.policy_action_space): + if is_continuous_action_space(self.env_action_space): # Clipping actions to the specified limits act = np.clip( act.numpy(), - self.policy_action_space.low, - self.policy_action_space.high, + self.env_action_space.low, + self.env_action_space.high, ) else: act = act.item() @@ -934,20 +925,16 @@ def _eval_checkpoint( self._init_envs(config, is_eval=True) - action_space = self.envs.action_spaces[0] - self.policy_action_space = action_space - self.orig_policy_action_space = self.envs.orig_action_spaces[0] - if is_continuous_action_space(action_space): + self._setup_actor_critic_agent(ppo_cfg) + if is_continuous_action_space(self.policy_action_space): # Assume NONE of the actions are discrete - action_shape = (get_num_actions(action_space),) + action_shape = (get_num_actions(self.policy_action_space),) discrete_actions = False else: # For discrete pointnav action_shape = (1,) discrete_actions = True - self._setup_actor_critic_agent(ppo_cfg) - if self.agent.actor_critic.should_load_agent_state: self.agent.load_state_dict(ckpt_dict["state_dict"]) self.actor_critic = self.agent.actor_critic @@ -1027,19 +1014,30 @@ def _eval_checkpoint( not_done_masks, deterministic=False, ) - test_recurrent_hidden_states = action_data.rnn_hidden_states - - prev_actions.copy_(action_data.actions) # type: ignore + if action_data.should_inserts is None: + test_recurrent_hidden_states = ( + action_data.rnn_hidden_states + ) + prev_actions.copy_(action_data.actions) # type: ignore + else: + for i, should_insert in enumerate( + action_data.should_inserts + ): + if should_insert.item(): + test_recurrent_hidden_states[ + i + ] = action_data.rnn_hidden_states[i] + prev_actions[i].copy_(action_data.actions[i]) # type: ignore # NB: Move actions to CPU. If CUDA tensors are # sent in to env.step(), that will create CUDA contexts # in the subprocesses. 
- if is_continuous_action_space(self.policy_action_space): + if is_continuous_action_space(self.env_action_space): # Clipping actions to the specified limits step_data = [ np.clip( a.numpy(), - self.policy_action_space.low, - self.policy_action_space.high, + self.env_action_space.low, + self.env_action_space.high, ) for a in action_data.env_actions.cpu() ] diff --git a/habitat-baselines/tb/events.out.tfevents.1674193555.s-164-67-210-79.resnet.ucla.edu.58058.0 b/habitat-baselines/tb/events.out.tfevents.1674193555.s-164-67-210-79.resnet.ucla.edu.58058.0 new file mode 100644 index 0000000000000000000000000000000000000000..11853e6cc35c6fa02416a8cca40abb3d004e18ad GIT binary patch literal 40 rcmb1OfPlsI-b$P&Paay*_34JA6mL>dVrHJ6YguYuiIvgv2lx8`_;n8f literal 0 HcmV?d00001 diff --git a/habitat-lab/habitat/config/benchmark/rearrange/rearrange_easy.yaml b/habitat-lab/habitat/config/benchmark/rearrange/rearrange_easy.yaml index a9f5b8b68d..7b5995e431 100644 --- a/habitat-lab/habitat/config/benchmark/rearrange/rearrange_easy.yaml +++ b/habitat-lab/habitat/config/benchmark/rearrange/rearrange_easy.yaml @@ -19,6 +19,7 @@ habitat: - obj_goal_gps_compass - joint - is_holding + - ee_pos environment: max_episode_steps: 1500 simulator: diff --git a/test/test_baseline_training.py b/test/test_baseline_training.py index 43a72477f7..52f0242b01 100644 --- a/test/test_baseline_training.py +++ b/test/test_baseline_training.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import glob +import itertools import os import random @@ -150,31 +151,35 @@ def test_trainers(config_path, num_updates, overrides, trainer_name): @pytest.mark.parametrize( - "config_path,mode,trainer_name", - [ - ( - "habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml", - "train", - "ppo", - ), - # ( - # "habitat-baselines/habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml", - # "eval", - # "ppo", - # ), - # ( - # "habitat-baselines/habitat_baselines/config/rearrange/tp_srl_oracle_nav.yaml", - # "eval", - # "ppo", - # ), - # ( - # "habitat-baselines/habitat_baselines/config/rearrange/tp_srl.yaml", - # "eval", - # "ppo", - # ), - ], + "config_path,policy_type,skill_type,mode", + list( + itertools.product( + [ + "habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml", + "habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml", + ], + [ + "hl_neural", + "hl_fixed", + ], + [ + "nn_skills", + "noop_skills", + ], + [ + "eval", + "train", + ], + ) + ), ) -def test_hrl(config_path, mode, trainer_name): +def test_hrl(config_path, policy_type, skill_type, mode): + if policy_type == "hl_neural" and skill_type == "nn_skills": + return + if policy_type == "hl_fixed" and mode == "train": + return + if skill_type == "noop_skills" and "oracle" not in config_path: + return # Remove the checkpoints from previous tests for f in glob.glob("data/test_checkpoints/test_training/*"): os.remove(f) @@ -189,7 +194,8 @@ def test_hrl(config_path, mode, trainer_name): "habitat_baselines.total_num_steps=-1.0", "habitat_baselines.test_episode_count=1", "habitat_baselines.checkpoint_folder=data/test_checkpoints/test_training", - f"habitat_baselines.trainer_name={trainer_name}", + f"habitat_baselines/rl/policy={policy_type}", + f"habitat_baselines/rl/policy/hierarchical_policy/defined_skills={skill_type}", ], ) with read_write(config): From fa94bfc56d23430651e7af68aa1bb352262defdd Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 20 Jan 2023 10:20:29 -0800 Subject: [PATCH 09/48] 
Fixed tests --- .../habitat_baselines/rl/hrl/hierarchical_policy.py | 1 + habitat-baselines/habitat_baselines/rl/hrl/skills/nav.py | 4 ++-- habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py | 3 ++- habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py | 4 +++- habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index ab8f9bd420..5e6da4ea32 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -261,6 +261,7 @@ def act( for i in batch_ids ], ) + # Always call high-level if the episode is over. call_high_level = call_high_level | (~masks_cpu).view(-1) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/nav.py index 834ea862f6..642aaf9a69 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/nav.py @@ -47,8 +47,8 @@ def _get_filtered_obs(self, observations, cur_batch_idx) -> TensorDict: ret_obs = super()._get_filtered_obs(observations, cur_batch_idx) if NavGoalPointGoalSensor.cls_uuid in ret_obs: - for i in cur_batch_idx: - if self._cur_skill_args[cur_batch_idx[i]].is_target: + for i, batch_i in enumerate(cur_batch_idx): + if self._cur_skill_args[batch_i].is_target: replace_sensor = TargetGoalGpsCompassSensor.cls_uuid else: replace_sensor = TargetStartGpsCompassSensor.cls_uuid diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py index 5c1ab5e23a..8e34798f2a 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py @@ -24,6 +24,7 @@ def __init__( self._target = np.array([float(x) for x in config.reset_joint_state]) self._arm_ac_range = find_action_range(action_space, "arm_action") + self._arm_ac_range = (self._arm_ac_range[0], self._target.shape[0]) def on_enter( self, @@ -84,7 +85,7 @@ def _internal_act( ) action = torch.zeros_like(prev_actions) - + # There is an extra grab action that we don't want to set. action[ ..., self._arm_ac_range[0] : self._arm_ac_range[1] ] = torch.from_numpy(delta).to( diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index fff04bf3d9..8252e87671 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -301,7 +301,9 @@ def _is_skill_done( self, observations, rnn_hidden_states, prev_actions, masks, batch_idx ) -> torch.BoolTensor: """ - :returns: A (batch_size,) size tensor where 1 indicates the skill wants to end and 0 if not. + :returns: A (batch_size,) size tensor where 1 indicates the skill wants + to end and 0 if not where batch_size is potentially a subset of the + overall num_environments as specified by `batch_idx`. 
""" return torch.zeros(observations.shape[0], dtype=torch.bool).to( masks.device diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py index caab4194ee..d5500d3185 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py @@ -29,7 +29,7 @@ def _is_skill_done( self, observations, rnn_hidden_states, prev_actions, masks, batch_idx ) -> torch.BoolTensor: assert self._wait_time > 0 - return self._cur_skill_step >= self._wait_time + return (self._cur_skill_step >= self._wait_time)[batch_idx] def _internal_act( self, From 11659b4be8c0d53ebd5799e0a0afe3799dc2c0ee Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 20 Jan 2023 10:31:36 -0800 Subject: [PATCH 10/48] Fixed formatting --- habitat-baselines/habitat_baselines/README.md | 2 +- .../rl/policy/monolithic.yaml | 2 +- .../rl/hrl/hierarchical_policy.py | 2 +- .../rl/hrl/hl/high_level_policy.py | 4 ++-- .../rl/hrl/hl/neural_policy.py | 6 +++--- .../rl/hrl/hrl_rollout_storage.py | 4 ++-- .../habitat_baselines/rl/ppo/policy.py | 19 +++++-------------- .../rl/ver/ver_rollout_storage.py | 4 ---- .../habitat_baselines/rl/ver/ver_trainer.py | 17 +---------------- 9 files changed, 16 insertions(+), 44 deletions(-) diff --git a/habitat-baselines/habitat_baselines/README.md b/habitat-baselines/habitat_baselines/README.md index a37992654c..bb0319da10 100644 --- a/habitat-baselines/habitat_baselines/README.md +++ b/habitat-baselines/habitat_baselines/README.md @@ -57,7 +57,7 @@ Change the `/benchmark/nav/pointnav: pointnav_gibson` in `habitat_baselines/conf We provide a two-layer hierarchical policy class, consisting of a low-level skill that moves the robot, and a high-level policy that reasons about which low-level skill to use in the current state. This can be especially powerful in long-horizon mobile manipulation tasks, like those introduced in [Habitat2.0](https://arxiv.org/abs/2106.14405). Both the low- and high- level can be either learned or an oracle. For oracle high-level we use [PDDL](https://planning.wiki/guide/whatis/pddl), and for oracle low-level we use instantaneous transitions, with the environment set to the final desired state. Additionally, for navigation, we provide an oracle navigation skill that uses A-star and the map of the environment to move the robot to its goal. -To run the following examples, you need the [ReplicaCAD dataset](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md#replicacad). +To run the following examples, you need the [ReplicaCAD dataset](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md#replicacad). 
To train a high-level policy, while using pre-learned low-level skills (SRL baseline from [Habitat2.0](https://arxiv.org/abs/2106.14405)), you can run: diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml index 9a1850d158..83f65a6e5f 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/monolithic.yaml @@ -1,4 +1,4 @@ name: "PointNavResNetPolicy" action_distribution_type: "gaussian" action_dist: - use_log_std: True \ No newline at end of file + use_log_std: True diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index 5e6da4ea32..7ecf3ab7ff 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -361,7 +361,7 @@ def get_value(self, observations, rnn_hidden_states, prev_actions, masks): observations, rnn_hidden_states, prev_actions, masks ) - def _get_policy_components(self) -> Tuple[nn.Module]: + def _get_policy_components(self) -> List[nn.Module]: return self._high_level_policy.get_policy_components() def evaluate_actions( diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index 6d1eb7bf50..c919abbf51 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -93,8 +93,8 @@ def get_next_skill( def apply_mask(self, mask: torch.Tensor) -> None: pass - def get_policy_components(self) -> Tuple[nn.Module]: - return tuple() + def get_policy_components(self) -> List[nn.Module]: + return [] def get_termination( self, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py index f8796f08a3..3c6def42d2 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py @@ -1,5 +1,5 @@ from itertools import chain -from typing import Any, List, Tuple +from typing import Any, List import gym.spaces as spaces import numpy as np @@ -110,8 +110,8 @@ def parameters(self): self._critic.parameters(), ) - def get_policy_components(self) -> Tuple[nn.Module]: - return (self,) + def get_policy_components(self) -> List[nn.Module]: + return [self] def forward(self, obs, rnn_hidden_states, masks, rnn_build_seq_info=None): hidden = [] diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index 6d8491e592..2ee96eb16a 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -10,7 +10,7 @@ from habitat_baselines.common.baseline_registry import baseline_registry from habitat_baselines.common.rollout_storage import RolloutStorage -from habitat_baselines.common.tensor_dict import TensorDict +from habitat_baselines.common.tensor_dict import DictTree from habitat_baselines.rl.models.rnn_state_encoder import ( build_pack_info_from_dones, build_rnn_build_seq_info, @@ -142,7 +142,7 @@ def compute_returns(self, next_value, use_gae, gamma, tau): 
def recurrent_generator( self, advantages, num_batches - ) -> Iterator[TensorDict]: + ) -> Iterator[DictTree]: num_environments = advantages.size(1) dones_cpu = ( torch.logical_not(self.buffers["masks"]) diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index 07da61345f..2ec25b68c8 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -5,16 +5,7 @@ # LICENSE file in the root directory of this source tree. import abc from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Iterable, - List, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union import torch from gym import spaces @@ -87,8 +78,8 @@ def get_policy_action_space( ) -> spaces.Space: return env_action_space - def _get_policy_components(self) -> Tuple[nn.Module]: - return tuple() + def _get_policy_components(self) -> List[nn.Module]: + return [] def aux_loss_parameters(self) -> Dict[str, Iterable[torch.Tensor]]: return {} @@ -268,8 +259,8 @@ def evaluate_actions( aux_loss_res, ) - def _get_policy_components(self) -> Tuple[nn.Module]: - return (self.net, self.critic, self.action_distribution) + def _get_policy_components(self) -> List[nn.Module]: + return [self.net, self.critic, self.action_distribution] def aux_loss_parameters(self) -> Dict[str, Iterable[torch.Tensor]]: return {k: v.parameters() for k, v in self.aux_loss_modules.items()} diff --git a/habitat-baselines/habitat_baselines/rl/ver/ver_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/ver/ver_rollout_storage.py index 361ee96ad4..d9f6c70af0 100644 --- a/habitat-baselines/habitat_baselines/rl/ver/ver_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/ver/ver_rollout_storage.py @@ -144,9 +144,7 @@ def __init__( action_space, recurrent_hidden_state_size, num_recurrent_layers=1, - action_shape: Optional[Tuple[int]] = None, is_double_buffered: bool = False, - discrete_actions: bool = True, ): super().__init__( numsteps, @@ -155,9 +153,7 @@ def __init__( action_space, recurrent_hidden_state_size, num_recurrent_layers, - action_shape, is_double_buffered, - discrete_actions, ) self.use_is_coeffs = variable_experience diff --git a/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py b/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py index d4241e77cc..504bdf71d5 100644 --- a/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py @@ -52,12 +52,7 @@ WorkerBase, WorkerQueues, ) -from habitat_baselines.utils.common import ( - cosine_decay, - get_num_actions, - inference_mode, - is_continuous_action_space, -) +from habitat_baselines.utils.common import cosine_decay, inference_mode try: torch.backends.cudnn.allow_tf32 = True @@ -196,14 +191,6 @@ def _init_train(self, resume_state): ) for ew in self.environment_workers ] - if is_continuous_action_space(action_space): - # Assume ALL actions are NOT discrete - action_shape = (get_num_actions(action_space),) - discrete_actions = False - else: - # For discrete pointnav - action_shape = (1,) - discrete_actions = True ppo_cfg = self.config.habitat_baselines.rl.ppo if torch.cuda.is_available(): @@ -253,8 +240,6 @@ def _init_train(self, resume_state): action_space=self.policy_action_space, recurrent_hidden_state_size=ppo_cfg.hidden_size, num_recurrent_layers=self.actor_critic.net.num_recurrent_layers, - 
action_shape=action_shape, - discrete_actions=discrete_actions, observation_space=rollouts_obs_space, ) self.rollouts = VERRolloutStorage(**storage_kwargs) From 633ebf3b128c493ad9e4429e51e3cb1257863b1c Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 08:47:51 -0800 Subject: [PATCH 11/48] Fixed device issues. Cleaned up configs. --- .../config/default_structured_configs.py | 4 +- .../defined_skills/nn_skills.yaml | 5 +- .../{noop_skills.yaml => oracle_skills.yaml} | 5 +- .../habitat_baselines/rl/policy/hl_fixed.yaml | 11 --- .../rl/policy/hl_neural.yaml | 11 --- .../rl/policy/oracle_nav.yaml | 39 +++++++++ .../config/rearrange/rl_hierarchical.yaml | 1 + .../rearrange/rl_hierarchical_oracle_nav.yaml | 81 +++---------------- .../rl/hrl/hierarchical_policy.py | 21 +++-- .../habitat_baselines/rl/hrl/skills/reset.py | 1 - .../habitat_baselines/rl/hrl/skills/skill.py | 10 ++- test/test_baseline_training.py | 4 +- 12 files changed, 78 insertions(+), 115 deletions(-) rename habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/{noop_skills.yaml => oracle_skills.yaml} (94%) create mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index c61405bf1e..8190f2d116 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. from dataclasses import dataclass, field -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple from hydra.core.config_store import ConfigStore from omegaconf import MISSING @@ -242,13 +242,13 @@ class HrlDefinedSkill(HabitatBaselinesBaseConfig): stop_thresh: float = 0.001 # For the reset_arm_skill reset_joint_state: List[float] = MISSING + pddl_action_names: Optional[List[str]] = None @dataclass class HierarchicalPolicy(HabitatBaselinesBaseConfig): high_level_policy: Dict[str, Any] = MISSING defined_skills: Dict[str, HrlDefinedSkill] = field(default_factory=dict) - use_skills: Dict[str, str] = field(default_factory=dict) @dataclass diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml index 24d7cbca38..a0370b63cf 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/nn_skills.yaml @@ -24,7 +24,7 @@ place: obs_skill_inputs: ["obj_goal_sensor"] load_ckpt_file: "data/models/place.pth" -wait_skill: +wait: skill_name: "WaitSkillPolicy" max_skill_steps: -1 force_end_on_timeout: False @@ -35,8 +35,9 @@ nav_to_obj: load_ckpt_file: "data/models/nav.pth" max_skill_steps: 300 obs_skill_input_dim: 2 + pddl_action_names: ["nav", "nav_to_receptacle"] -reset_arm_skill: +reset_arm: skill_name: "ResetArmSkill" max_skill_steps: 50 reset_joint_state: [-4.50e-01, -1.08e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] diff --git 
a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/noop_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml similarity index 94% rename from habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/noop_skills.yaml rename to habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml index c8b1a34e29..d3f9c34723 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/noop_skills.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml @@ -33,7 +33,7 @@ place: apply_postconds: True force_end_on_timeout: False -wait_skill: +wait: skill_name: "WaitSkillPolicy" max_skill_steps: -1 @@ -44,8 +44,9 @@ nav_to_obj: apply_postconds: True force_end_on_timeout: False obs_skill_input_dim: 2 + pddl_action_names: ["nav", "nav_to_receptacle"] -reset_arm_skill: +reset_arm: skill_name: "ResetArmSkill" max_skill_steps: 50 reset_joint_state: [-4.50e-01, -1.07e00, 9.95e-02, 9.38e-01, -7.88e-04, 1.57e00, 4.62e-03] diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml index 0aef449dbe..018fd32519 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_fixed.yaml @@ -7,15 +7,4 @@ hierarchical_policy: high_level_policy: name: "FixedHighLevelPolicy" add_arm_rest: True - use_skills: - open_cab: "open_cab" - open_fridge: "open_fridge" - close_cab: "close_cab" - close_fridge: "close_fridge" - pick: "pick" - place: "place" - nav: "nav_to_obj" - nav_to_receptacle: "nav_to_obj" - wait: "wait_skill" - reset_arm: "reset_arm_skill" defined_skills: {} diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml index f15a1eb389..2e7d9a4c87 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml @@ -15,15 +15,4 @@ hierarchical_policy: num_rnn_layers: 2 policy_input_keys: - "robot_head_depth" - use_skills: - open_cab: "open_cab" - open_fridge: "open_fridge" - close_cab: "open_cab" - close_fridge: "open_fridge" - pick: "pick" - place: "place" - nav: "nav_to_obj" - nav_to_receptacle: "nav_to_obj" - wait: "wait_skill" - reset_arm: "reset_arm_skill" defined_skills: {} diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml new file mode 100644 index 0000000000..8aa067b89c --- /dev/null +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml @@ -0,0 +1,39 @@ +# @package _global_ +defaults: + - /habitat/task/actions: + - pddl_apply_action + - oracle_nav_action + - arm_action + - base_velocity + - rearrange_stop +habitat: + gym: + obs_keys: + - robot_head_depth + - relative_resting_position + - obj_start_sensor + - obj_goal_sensor + - obj_start_gps_compass + - 
obj_goal_gps_compass + - joint + - is_holding + - ee_pos + - localization_sensor + simulator: + habitat_sim_v0: + allow_sliding: True + + +habitat_baselines: + rl: + policy: + hierarchical_policy: + # Override to use the oracle navigation skill. + use_skills: + nav: "oracle_nav" + nav_to_receptacle: "oracle_nav" + defined_skills: + oracle_nav: + skill_name: "OracleNavPolicy" + obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] + max_skill_steps: 300 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml index df1f0ee015..387b8398f6 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -3,6 +3,7 @@ defaults: - /benchmark/rearrange: rearrange_easy - /habitat_baselines: habitat_baselines_rl_config_base + - /habitat/simulator/sim_sensors@habitat_baselines.eval.extra_sim_sensors.third_rgb_sensor: third_rgb_sensor - /habitat_baselines/rl/policy/obs_transforms: - add_virtual_keys_base - /habitat_baselines/rl/policy: hl_fixed diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml index 3152f00641..bb3e76924e 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml @@ -1,18 +1,15 @@ # @package _global_ +# Extends the `rl_hierarchical` config to use an oracle navigation action. +# Several things change to support the oracle navigation: +# - Adding the oracle navigation action +# - + defaults: - - /benchmark/rearrange: rearrange_easy - - /habitat_baselines: habitat_baselines_rl_config_base - - /habitat_baselines/rl/policy/obs_transforms: - - add_virtual_keys_base - - /habitat_baselines/rl/policy: hl_neural - - /habitat_baselines/rl/policy/hierarchical_policy/defined_skills: noop_skills + - rl_hierarchical - /habitat/task/actions: - pddl_apply_action - oracle_nav_action - - arm_action - - base_velocity - - rearrange_stop - _self_ habitat: @@ -28,74 +25,18 @@ habitat: - is_holding - ee_pos - localization_sensor - + simulator: + habitat_sim_v0: + allow_sliding: True habitat_baselines: - verbose: False - trainer_name: "ddppo" - torch_gpu_id: 0 - rollout_storage: "HrlRolloutStorage" - updater_name: "HrlPPO" - distrib_updater_name: "HrlDDPPO" - video_fps: 30 - eval_ckpt_path_dir: "" - num_environments: 3 - writer_type: 'tb' - checkpoint_folder: "data/new_checkpoints" - num_updates: -1 - total_num_steps: 1.0e8 - log_interval: 10 - num_checkpoints: 20 - force_torch_single_threaded: True - eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] - load_resume_state_config: False - - eval: - use_ckpt_config: False - should_load_ckpt: False - video_option: ["disk"] rl: policy: hierarchical_policy: - use_skills: - nav: "oracle_nav" - nav_to_receptacle: "oracle_nav" + # Override to use the oracle navigation skill. 
defined_skills: - oracle_nav: + nav_to_obj: skill_name: "OracleNavPolicy" obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] max_skill_steps: 300 - ppo: - # ppo params - clip_param: 0.2 - ppo_epoch: 2 - num_mini_batch: 2 - value_loss_coef: 0.5 - entropy_coef: 0.0001 - lr: 2.5e-4 - eps: 1e-5 - max_grad_norm: 0.2 - num_steps: 128 - use_gae: True - gamma: 0.99 - tau: 0.95 - - ddppo: - sync_frac: 0.6 - # The PyTorch distributed backend to use - distrib_backend: NCCL - # Visual encoder backbone - pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth - # Initialize with pretrained weights - pretrained: False - # Initialize just the visual encoder backbone with pretrained weights - pretrained_encoder: False - # Whether the visual encoder backbone will be trained. - train_encoder: True - # Whether to reset the critic linear layer - reset_critic: False - # Model parameters - backbone: resnet18 - rnn_type: LSTM - num_recurrent_layers: 2 diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index 7ecf3ab7ff..9078e4b894 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -70,16 +70,9 @@ def __init__( config, ) - for i, (skill_id, use_skill_name) in enumerate( - config.hierarchical_policy.use_skills.items() + for i, (skill_name, skill_config) in enumerate( + config.hierarchical_policy.defined_skills.items() ): - if use_skill_name == "": - # Skip loading this skill if no name is provided - continue - skill_config = config.hierarchical_policy.defined_skills[ - use_skill_name - ] - cls = eval(skill_config.skill_name) skill_policy = cls.from_config( skill_config, @@ -90,8 +83,13 @@ def __init__( ) skill_policy.set_pddl_problem(self._pddl_problem) self._skills[i] = skill_policy - self._name_to_idx[skill_id] = i - self._idx_to_name[i] = skill_id + if skill_config.pddl_action_names is None: + action_names = [skill_name] + else: + action_names = skill_config.pddl_action_names + for skill_id in action_names: + self._name_to_idx[skill_id] = i + self._idx_to_name[i] = skill_id self._cur_skills: torch.Tensor = torch.full( (self._num_envs,), -1, dtype=torch.long @@ -248,6 +246,7 @@ def act( continue # TODO: either change name of the function or assign actions somewhere # else. 
Updating actions in should_terminate is counterintuitive + ( call_high_level[batch_ids], bad_should_terminate[batch_ids], diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py index 8e34798f2a..efe5531075 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py @@ -59,7 +59,6 @@ def _is_skill_done( return ( torch.as_tensor( np.abs(current_joint_pos - self._target).max(-1), - device=rnn_hidden_states.device, dtype=torch.float32, ) < 5e-2 diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 8252e87671..dfe0a75738 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -158,11 +158,15 @@ def should_terminate( log_info, ) -> Tuple[torch.BoolTensor, torch.BoolTensor, torch.Tensor]: """ - :returns: A (batch_size,) size tensor where 1 indicates the skill wants to end and 0 if not. + :returns: Both of the BoolTensor's will be on the CPU. + - `is_skill_done`: Shape (batch_size,) size tensor where 1 + indicates the skill to return control to HL policy. + - `bad_terminate`: Shape (batch_size,) size tensor where 1 + indicates the skill should immediately end the episode. """ is_skill_done = self._is_skill_done( observations, rnn_hidden_states, prev_actions, masks, batch_idx - ) + ).cpu() if is_skill_done.sum() > 0: self._internal_log( f"Requested skill termination {is_skill_done}", @@ -274,7 +278,7 @@ def act( return action_data def to(self, device): - self._cur_skill_step = self._cur_skill_step.to(device) + pass def _select_obs(self, obs, cur_batch_idx): """ diff --git a/test/test_baseline_training.py b/test/test_baseline_training.py index 52f0242b01..d377990317 100644 --- a/test/test_baseline_training.py +++ b/test/test_baseline_training.py @@ -164,7 +164,7 @@ def test_trainers(config_path, num_updates, overrides, trainer_name): ], [ "nn_skills", - "noop_skills", + "oracle_skills", ], [ "eval", @@ -178,7 +178,7 @@ def test_hrl(config_path, policy_type, skill_type, mode): return if policy_type == "hl_fixed" and mode == "train": return - if skill_type == "noop_skills" and "oracle" not in config_path: + if skill_type == "oracle_skills" and "oracle" not in config_path: return # Remove the checkpoints from previous tests for f in glob.glob("data/test_checkpoints/test_training/*"): From 1c82390e4f02973cd0d4356dc9542c087f7e5d6d Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 08:59:32 -0800 Subject: [PATCH 12/48] More config cleanup --- .../rl/policy/oracle_nav.yaml | 39 ------------------- .../rearrange/rl_hierarchical_oracle_nav.yaml | 4 +- .../rl/hrl/hierarchical_policy.py | 15 ++++--- 3 files changed, 11 insertions(+), 47 deletions(-) delete mode 100644 habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml deleted file mode 100644 index 8aa067b89c..0000000000 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/oracle_nav.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# @package _global_ -defaults: - - /habitat/task/actions: - - pddl_apply_action - - oracle_nav_action - - arm_action - - base_velocity - - rearrange_stop -habitat: - gym: - 
obs_keys: - - robot_head_depth - - relative_resting_position - - obj_start_sensor - - obj_goal_sensor - - obj_start_gps_compass - - obj_goal_gps_compass - - joint - - is_holding - - ee_pos - - localization_sensor - simulator: - habitat_sim_v0: - allow_sliding: True - - -habitat_baselines: - rl: - policy: - hierarchical_policy: - # Override to use the oracle navigation skill. - use_skills: - nav: "oracle_nav" - nav_to_receptacle: "oracle_nav" - defined_skills: - oracle_nav: - skill_name: "OracleNavPolicy" - obs_skill_inputs: ["obj_start_sensor", "abs_obj_start_sensor", "obj_goal_sensor", "abs_obj_goal_sensor"] - max_skill_steps: 300 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml index bb3e76924e..92b80bea64 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml @@ -3,7 +3,8 @@ # Extends the `rl_hierarchical` config to use an oracle navigation action. # Several things change to support the oracle navigation: # - Adding the oracle navigation action -# - +# - Add sliding. +# - Use the oracle navigation skill. defaults: - rl_hierarchical @@ -30,7 +31,6 @@ habitat: allow_sliding: True habitat_baselines: - rl: policy: hierarchical_policy: diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index 9078e4b894..7376fc822a 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -70,9 +70,11 @@ def __init__( config, ) - for i, (skill_name, skill_config) in enumerate( - config.hierarchical_policy.defined_skills.items() - ): + skill_i = 0 + for ( + skill_name, + skill_config, + ) in config.hierarchical_policy.defined_skills.items(): cls = eval(skill_config.skill_name) skill_policy = cls.from_config( skill_config, @@ -82,14 +84,15 @@ def __init__( full_config, ) skill_policy.set_pddl_problem(self._pddl_problem) - self._skills[i] = skill_policy if skill_config.pddl_action_names is None: action_names = [skill_name] else: action_names = skill_config.pddl_action_names for skill_id in action_names: - self._name_to_idx[skill_id] = i - self._idx_to_name[i] = skill_id + self._name_to_idx[skill_id] = skill_i + self._idx_to_name[skill_i] = skill_id + self._skills[skill_i] = skill_policy + skill_i += 1 self._cur_skills: torch.Tensor = torch.full( (self._num_envs,), -1, dtype=torch.long From 2e973f5c525cee5cb339d3596e5254109ae8bb1c Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 10:13:43 -0800 Subject: [PATCH 13/48] Addressing PR comments --- .../common/baseline_registry.py | 5 ++- .../config/default_structured_configs.py | 10 +++--- .../config/rearrange/rl_hierarchical.yaml | 3 ++ .../habitat_baselines/rl/hrl/__init__.py | 2 +- .../rl/hrl/hierarchical_policy.py | 15 ++++++-- .../rl/hrl/hl/fixed_policy.py | 6 ++-- .../rl/hrl/hl/high_level_policy.py | 3 +- .../habitat_baselines/rl/hrl/hrl_ppo.py | 4 +-- .../rl/hrl/hrl_rollout_storage.py | 35 +++++++++++++++---- .../habitat_baselines/rl/hrl/skills/skill.py | 6 ++-- .../config/default_structured_configs.py | 3 -- 11 files changed, 65 insertions(+), 27 deletions(-) diff --git a/habitat-baselines/habitat_baselines/common/baseline_registry.py 
b/habitat-baselines/habitat_baselines/common/baseline_registry.py index 7ff89a28b6..864833b0a2 100644 --- a/habitat-baselines/habitat_baselines/common/baseline_registry.py +++ b/habitat-baselines/habitat_baselines/common/baseline_registry.py @@ -23,6 +23,7 @@ from typing import Optional from habitat.core.registry import Registry +from habitat_baselines.common.rollout_storage import RolloutStorage class BaselineRegistry(Registry): @@ -138,7 +139,9 @@ def get_auxiliary_loss(cls, name: str): @classmethod def register_storage(cls, to_register=None, *, name: Optional[str] = None): - return cls._register_impl("storage", to_register, name) + return cls._register_impl( + "storage", to_register, name, assert_type=RolloutStorage + ) @classmethod def get_storage(cls, name: str): diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index 8190f2d116..062340b64c 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -224,7 +224,7 @@ class Eq2CubeConfig(ObsTransformConfig): @dataclass -class HrlDefinedSkill(HabitatBaselinesBaseConfig): +class HrlDefinedSkillConfig(HabitatBaselinesBaseConfig): skill_name: str = MISSING name: str = "PointNavResNetPolicy" action_distribution_type: str = "gaussian" @@ -246,9 +246,11 @@ class HrlDefinedSkill(HabitatBaselinesBaseConfig): @dataclass -class HierarchicalPolicy(HabitatBaselinesBaseConfig): +class HierarchicalPolicyConfig(HabitatBaselinesBaseConfig): high_level_policy: Dict[str, Any] = MISSING - defined_skills: Dict[str, HrlDefinedSkill] = field(default_factory=dict) + defined_skills: Dict[str, HrlDefinedSkillConfig] = field( + default_factory=dict + ) @dataclass @@ -259,7 +261,7 @@ class PolicyConfig(HabitatBaselinesBaseConfig): # For gaussian action distribution: action_dist: ActionDistributionConfig = ActionDistributionConfig() obs_transforms: Dict[str, ObsTransformConfig] = field(default_factory=dict) - hierarchical_policy: HierarchicalPolicy = MISSING + hierarchical_policy: HierarchicalPolicyConfig = MISSING @dataclass diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml index 387b8398f6..70d35f7702 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -1,5 +1,8 @@ # @package _global_ +# Config for running hierarchical policies where a high-level (HL) policy selects from a set of low-level (LL) policies. +# Supports different HL policy configurations and using a variety of LL policies. + defaults: - /benchmark/rearrange: rearrange_easy - /habitat_baselines: habitat_baselines_rl_config_base diff --git a/habitat-baselines/habitat_baselines/rl/hrl/__init__.py b/habitat-baselines/habitat_baselines/rl/hrl/__init__.py index 75f7fadf5c..394540d026 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/__init__.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from habitat_baselines.rl.hrl.hrl_ppo import HrlPPO +from habitat_baselines.rl.hrl.hrl_ppo import HRLPPO from habitat_baselines.rl.hrl.hrl_rollout_storage import HrlRolloutStorage __all__ = [ diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index 7376fc822a..5c768aba17 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -40,6 +40,15 @@ @baseline_registry.register_policy class HierarchicalPolicy(nn.Module, Policy): + """ + :property _pddl_problem: Stores the PDDL domain information. This allows + accessing all the possible entities, actions, and predicates. Note that + this is not the grounded PDDL problem with truth values assigned to the + predicates based on the current simulator state. + """ + + _pddl_problem: PddlProblem + def __init__( self, config, @@ -213,7 +222,7 @@ def act( (self._num_envs,), dtype=torch.bool ) - hl_says_term = self._high_level_policy.get_termination( + hl_policy_wants_termination = self._high_level_policy.get_termination( observations, rnn_hidden_states, prev_actions, @@ -221,7 +230,7 @@ def act( self._cur_skills, log_info, ) - # Compute the actions from the current skills + # Initialize empty action set based on the overall action space. actions = torch.zeros( (self._num_envs, get_num_actions(self._action_space)), device=masks.device, @@ -235,7 +244,7 @@ def act( "prev_actions": prev_actions, "masks": masks, "actions": actions, - "hl_says_term": hl_says_term, + "hl_policy_wants_termination": hl_policy_wants_termination, }, # Only decide on skill termination if the episode is active. should_adds=masks, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py index 8b6e6a6552..fa6b09f542 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/fixed_policy.py @@ -32,10 +32,10 @@ def __init__(self, *args, **kwargs): def _parse_solution_actions(self, solution): solution_actions = [] - for i, sol_step in enumerate(solution): + for i, hl_action in enumerate(solution): sol_action = ( - sol_step.name, - [x.name for x in sol_step.param_values], + hl_action.name, + [x.name for x in hl_action.param_values], ) solution_actions.append(sol_action) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index c919abbf51..d4dd946415 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -107,9 +107,10 @@ def get_termination( ) -> torch.BoolTensor: """ Can force the currently executing skill to terminate. + In the base HighLevelPolicy, the skill always continues. Returns: A binary tensor where 1 indicates the current skill should - terminate and 0 indicates the skill can continue. + terminate and 0 indicates the skill can continue.
""" return torch.zeros(self._num_envs, dtype=torch.bool) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py index a0cb2222a7..31e742de5c 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py @@ -11,7 +11,7 @@ @baseline_registry.register_updater -class HrlPPO(PPO): +class HRLPPO(PPO): def _update_from_batch(self, batch, epoch, rollouts, learner_metrics): n_samples = max(batch["loss_mask"].sum(), 1) @@ -112,5 +112,5 @@ def reduce_loss(loss): @baseline_registry.register_updater -class HrlDDPPO(DecentralizedDistributedMixin, HrlPPO): +class HrlDDPPO(DecentralizedDistributedMixin, HRLPPO): pass diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index 2ee96eb16a..8cc1730e51 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -4,7 +4,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Iterator +from typing import Iterator, Optional import torch @@ -21,15 +21,22 @@ @baseline_registry.register_storage class HrlRolloutStorage(RolloutStorage): + """ + Supports variable writes to the rollout buffer where data is not inserted + into the buffer on every step. When getting batches from the storage, these + batches will only contain samples that were written. This means that the + batches could be variable size and less than the maximum size of the + rollout buffer. + """ + def __init__(self, numsteps, num_envs, *args, **kwargs): super().__init__(numsteps, num_envs, *args, **kwargs) self._num_envs = num_envs self._cur_step_idxs = torch.zeros(self._num_envs, dtype=torch.long) self._last_should_inserts = None - if self.is_double_buffered: - raise ValueError( - "HRL storage does not support double buffered sampling" - ) + assert ( + not self.is_double_buffered + ), "HRL storage does not support double buffered sampling" def insert( self, @@ -41,8 +48,19 @@ def insert( rewards=None, next_masks=None, buffer_index: int = 0, - should_inserts=None, + should_inserts: Optional[torch.BoolTensor] = None, ): + """ + The only key different from the base `RolloutStorage` is + `should_inserts`. This is a bool tensor of shape [# environments,]. If + `should_insert[i] == True`, then this will the sample at enviroment + index `i` into the rollout buffer at environment index `i`, if not, it + will ignore the sample. If None, this defaults to the last insert + state. + + Rewards are summed when `should_insert[i] == False`. + """ + if next_masks is not None: next_masks = next_masks.to(self.device) if rewards is not None: @@ -143,6 +161,11 @@ def compute_returns(self, next_value, use_gae, gamma, tau): def recurrent_generator( self, advantages, num_batches ) -> Iterator[DictTree]: + """ + Generates data batches based on the data that has been written to the + rollout buffer. 
+ """ + num_environments = advantages.size(1) dones_cpu = ( torch.logical_not(self.buffers["masks"]) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index dfe0a75738..0e1ac275b7 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -152,7 +152,7 @@ def should_terminate( prev_actions, masks, actions, - hl_says_term, + hl_policy_wants_termination, batch_idx: List[int], skill_name: List[str], log_info, @@ -198,7 +198,7 @@ def should_terminate( elif ( self._config.apply_postconds and is_skill_done[i] == 1.0 - and hl_says_term[i] == 0.0 + and hl_policy_wants_termination[i] == 0.0 ): new_actions[i] = self._apply_postcond( actions, log_info, skill_name[i], env_i, i @@ -206,7 +206,7 @@ def should_terminate( self._delay_term[env_i] = True is_skill_done[i] = 0.0 - is_skill_done |= hl_says_term + is_skill_done |= hl_policy_wants_termination if bad_terminate.sum() > 0: self._internal_log( diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index 7ae7ab593f..69e8dbcd6b 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -149,9 +149,6 @@ class OracleNavActionConfig(ActionConfig): forward_velocity: float = 1.0 turn_thresh: float = 0.1 dist_thresh: float = 0.2 - stop_thresh: float = 0.001 - spawn_max_dist_to_obj: float = 2.0 - num_spawn_attempts: int = 200 lin_speed: float = 10.0 ang_speed: float = 10.0 allow_dyn_slide: bool = True From 3a805eaa2576f9dea1f84e5209c114e95887985a Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 10:23:55 -0800 Subject: [PATCH 14/48] Updated circular reference --- .../habitat_baselines/common/baseline_registry.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/common/baseline_registry.py b/habitat-baselines/habitat_baselines/common/baseline_registry.py index 864833b0a2..17b5697084 100644 --- a/habitat-baselines/habitat_baselines/common/baseline_registry.py +++ b/habitat-baselines/habitat_baselines/common/baseline_registry.py @@ -23,7 +23,6 @@ from typing import Optional from habitat.core.registry import Registry -from habitat_baselines.common.rollout_storage import RolloutStorage class BaselineRegistry(Registry): @@ -139,6 +138,13 @@ def get_auxiliary_loss(cls, name: str): @classmethod def register_storage(cls, to_register=None, *, name: Optional[str] = None): + """ + Registers data storage for storing data in the policy rollout in the + trainer and then for fetching data batches for the updater. + """ + + from habitat_baselines.common.rollout_storage import RolloutStorage + return cls._register_impl( "storage", to_register, name, assert_type=RolloutStorage ) @@ -149,6 +155,10 @@ def get_storage(cls, name: str): @classmethod def register_updater(cls, to_register=None, *, name: Optional[str] = None): + """ + Registers a policy updater. 
+ """ + return cls._register_impl("updater", to_register, name) @classmethod From 7d92072817588d97328e7ba2fda59b491be464c2 Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 10:44:13 -0800 Subject: [PATCH 15/48] Addressing PR comments --- .../config/default_structured_configs.py | 11 ++++++++++- .../config/habitat_baselines/rl/policy/hl_neural.yaml | 2 +- .../habitat_baselines/rl/hrl/hierarchical_policy.py | 9 +++++++-- .../habitat_baselines/rl/hrl/hl/high_level_policy.py | 2 +- .../habitat_baselines/rl/hrl/skills/nn_skill.py | 4 ++-- .../habitat_baselines/rl/hrl/skills/noop.py | 4 ++-- .../habitat_baselines/rl/hrl/skills/oracle_nav.py | 6 +++--- .../habitat_baselines/rl/hrl/skills/pick.py | 6 ++++-- .../habitat_baselines/rl/hrl/skills/reset.py | 4 ++-- .../habitat_baselines/rl/hrl/skills/skill.py | 8 ++++---- .../habitat_baselines/rl/hrl/skills/wait.py | 4 ++-- habitat-baselines/habitat_baselines/rl/hrl/utils.py | 5 +++-- habitat-baselines/habitat_baselines/rl/ppo/policy.py | 8 ++++---- 13 files changed, 45 insertions(+), 28 deletions(-) diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index 062340b64c..99e4e8d781 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -225,20 +225,29 @@ class Eq2CubeConfig(ObsTransformConfig): @dataclass class HrlDefinedSkillConfig(HabitatBaselinesBaseConfig): + """ + Defines a low-level skill to be used in the hierarchical policy. + """ + skill_name: str = MISSING name: str = "PointNavResNetPolicy" action_distribution_type: str = "gaussian" load_ckpt_file: str = "" max_skill_steps: int = 200 + # If true, the stop action will be called if the skill times out. force_end_on_timeout: bool = True + # Overrides the config file of a neural network skill rather than loading + # the config file from the checkpoint file. force_config_file: str = "" at_resting_threshold: float = 0.15 + # If true, this willapply the post-conditions of the skill after it + # terminates. 
apply_postconds: bool = False obs_skill_inputs: List[str] = field(default_factory=list) obs_skill_input_dim: int = 3 start_zone_radius: float = 0.3 # For the oracle navigation skill - nav_action_name: str = "base_velocity" + action_name: str = "base_velocity" stop_thresh: float = 0.001 # For the reset_arm_skill reset_joint_state: List[float] = MISSING diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml index 2e7d9a4c87..4246ca0e03 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml @@ -15,4 +15,4 @@ hierarchical_policy: num_rnn_layers: 2 policy_input_keys: - "robot_head_depth" - defined_skills: {} + defined_skills: MISSING diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index 5c768aba17..b1e0973a80 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -34,7 +34,7 @@ WaitSkillPolicy, ) from habitat_baselines.rl.hrl.utils import find_action_range -from habitat_baselines.rl.ppo.policy import Policy, PolicyAction +from habitat_baselines.rl.ppo.policy import Policy, PolicyActionData from habitat_baselines.utils.common import get_num_actions @@ -128,6 +128,11 @@ def eval(self): def get_policy_action_space( self, env_action_space: spaces.Space ) -> spaces.Space: + """ + Fetches the policy action space for learning. If we are learning the HL + policy, it will return its custom action space for learning. + """ + return self._high_level_policy.get_policy_action_space( env_action_space ) @@ -360,7 +365,7 @@ def act( } action_kwargs.update(hl_info) - return PolicyAction( + return PolicyActionData( take_actions=actions, policy_info=log_info, should_inserts=call_high_level, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index d4dd946415..c148c62f28 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -86,7 +86,7 @@ def get_next_skill( - skill_args_data: Arguments for the next skill. - immediate_end: Binary masks indicating which environment(s) should end immediately. 
- - Information for PolicyAction + - Information for PolicyActionData """ raise NotImplementedError() diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py index 3478613fb1..921e96dd2f 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py @@ -14,7 +14,7 @@ from habitat_baselines.common.tensor_dict import TensorDict from habitat_baselines.config.default import get_config from habitat_baselines.rl.hrl.skills.skill import SkillPolicy -from habitat_baselines.rl.ppo.policy import PolicyAction +from habitat_baselines.rl.ppo.policy import PolicyActionData from habitat_baselines.utils.common import get_num_actions @@ -119,7 +119,7 @@ def _internal_act( masks, cur_batch_idx, deterministic=False, - ) -> PolicyAction: + ) -> PolicyActionData: filtered_obs = self._get_filtered_obs(observations, cur_batch_idx) filtered_prev_actions = prev_actions[ diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py index 6456344cb9..271afeaa0f 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/noop.py @@ -4,7 +4,7 @@ import torch from habitat_baselines.rl.hrl.skills.skill import SkillPolicy -from habitat_baselines.rl.ppo.policy import PolicyAction +from habitat_baselines.rl.ppo.policy import PolicyActionData class NoopSkillPolicy(SkillPolicy): @@ -34,6 +34,6 @@ def _internal_act( full_action, deterministic=False, ): - return PolicyAction( + return PolicyActionData( actions=full_action, rnn_hidden_states=rnn_hidden_states ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index 2208084edc..ffcd805da9 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -17,7 +17,7 @@ from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.hrl.skills.nn_skill import NnSkillPolicy from habitat_baselines.rl.hrl.utils import find_action_range -from habitat_baselines.rl.ppo.policy import PolicyAction +from habitat_baselines.rl.ppo.policy import PolicyActionData class OracleNavPolicy(NnSkillPolicy): @@ -88,7 +88,7 @@ def from_config( cls, config, observation_space, action_space, batch_size, full_config ): filtered_action_space = ActionSpace( - {config.nav_action_name: action_space[config.nav_action_name]} + {config.action_name: action_space[config.nav_action_name]} ) baselines_logger.debug( f"Loaded action space {filtered_action_space} for skill {config.skill_name}" @@ -190,6 +190,6 @@ def _internal_act( full_action[:, self._oracle_nav_ac_idx] = action_idxs - return PolicyAction( + return PolicyActionData( actions=full_action, rnn_hidden_states=rnn_hidden_states ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py index 4efb63d415..4cc58e520f 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/pick.py @@ -9,7 +9,7 @@ RelativeRestingPositionSensor, ) from habitat_baselines.rl.hrl.skills.nn_skill import NnSkillPolicy -from habitat_baselines.rl.ppo.policy import PolicyAction +from habitat_baselines.rl.ppo.policy import PolicyActionData class 
PickSkillPolicy(NnSkillPolicy): @@ -34,7 +34,9 @@ def _parse_skill_arg(self, skill_arg): self._internal_log(f"Parsing skill argument {skill_arg}") return int(skill_arg[0].split("|")[1]) - def _mask_pick(self, action: PolicyAction, observations) -> PolicyAction: + def _mask_pick( + self, action: PolicyActionData, observations + ) -> PolicyActionData: # Mask out the release if the object is already held. is_holding = observations[IsHoldingSensor.cls_uuid].view(-1) for i in torch.nonzero(is_holding): diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py index efe5531075..39bf4ea058 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py @@ -10,7 +10,7 @@ from habitat_baselines.rl.hrl.skills.skill import SkillPolicy from habitat_baselines.rl.hrl.utils import find_action_range -from habitat_baselines.rl.ppo.policy import PolicyAction +from habitat_baselines.rl.ppo.policy import PolicyActionData class ResetArmSkill(SkillPolicy): @@ -91,6 +91,6 @@ def _internal_act( device=action.device, dtype=action.dtype ) - return PolicyAction( + return PolicyActionData( actions=action, rnn_hidden_states=rnn_hidden_states ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 0e1ac275b7..c6d6f3fb61 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -10,7 +10,7 @@ from habitat.tasks.rearrange.rearrange_sensors import IsHoldingSensor from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.hrl.utils import find_action_range -from habitat_baselines.rl.ppo.policy import Policy, PolicyAction +from habitat_baselines.rl.ppo.policy import Policy, PolicyActionData from habitat_baselines.utils.common import get_num_actions @@ -81,8 +81,8 @@ def _get_multi_sensor_index(self, batch_idx: List[int]) -> List[int]: return [self._cur_skill_args[i] for i in batch_idx] def _keep_holding_state( - self, action_data: PolicyAction, observations - ) -> PolicyAction: + self, action_data: PolicyActionData, observations + ) -> PolicyActionData: """ Makes the action so it does not result in dropping or picking up an object. 
Used in navigation and other skills which are not supposed to @@ -327,5 +327,5 @@ def _internal_act( masks, cur_batch_idx, deterministic=False, - ) -> PolicyAction: + ) -> PolicyActionData: raise NotImplementedError() diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py index d5500d3185..6a0887ba96 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/wait.py @@ -8,7 +8,7 @@ import torch from habitat_baselines.rl.hrl.skills.skill import SkillPolicy -from habitat_baselines.rl.ppo.policy import PolicyAction +from habitat_baselines.rl.ppo.policy import PolicyActionData class WaitSkillPolicy(SkillPolicy): @@ -43,6 +43,6 @@ def _internal_act( action = torch.zeros( (masks.shape[0], self._full_ac_size), device=prev_actions.device ) - return PolicyAction( + return PolicyActionData( actions=action, rnn_hidden_states=rnn_hidden_states ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/utils.py b/habitat-baselines/habitat_baselines/rl/hrl/utils.py index 0c22f3b183..d2b56d2f1e 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/utils.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/utils.py @@ -12,7 +12,8 @@ def find_action_range( action_space: ActionSpace, search_key: str ) -> Tuple[int, int]: """ - Returns the start and end indices of an action key in the action tensor. + Returns the start and end indices of an action key in the action tensor. If + the key is not found. It returns None. """ start_idx = 0 @@ -24,5 +25,5 @@ def find_action_range( break start_idx += get_num_actions(action_space[k]) if not found: - return None + raise ValueError(f"Could not find stop action in {action_space}") return start_idx, end_idx diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index 2ec25b68c8..ecfd261063 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -32,7 +32,7 @@ @dataclass -class PolicyAction: +class PolicyActionData: """ Information returned from the `Policy` class. """ @@ -94,7 +94,7 @@ def all_policy_tensors(self) -> Iterable[torch.Tensor]: yield from c.buffers() def extract_policy_info( - self, action_data: PolicyAction, infos, dones + self, action_data: PolicyActionData, infos, dones ) -> List[Dict[str, float]]: """ Gets the log information from the policy at the current time step. 
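A small, self-contained illustration of the index arithmetic that `find_action_range` is documented to perform over a dict-style action space; the sub-space names and sizes below are invented, and plain `gym.spaces.Dict` stands in for Habitat's `ActionSpace`.

import gym.spaces as spaces

# Keys are listed alphabetically since gym's Dict space may reorder them.
toy_action_space = spaces.Dict(
    {
        "arm_action": spaces.Box(-1.0, 1.0, (7,)),     # flat indices [0, 7)
        "base_velocity": spaces.Box(-1.0, 1.0, (2,)),  # flat indices [7, 9)
        "rearrange_stop": spaces.Box(-1.0, 1.0, (1,)), # flat indices [9, 10)
    }
)

def sketch_find_action_range(action_space, search_key):
    # Walk the sub-spaces in order, summing flattened sizes until the key is
    # found; mirrors the documented behavior of raising when the key is missing.
    start_idx = 0
    for key, space in action_space.spaces.items():
        size = spaces.flatdim(space)
        if key == search_key:
            return start_idx, start_idx + size
        start_idx += size
    raise ValueError(f"Could not find {search_key} in {action_space}")

print(sketch_find_action_range(toy_action_space, "base_velocity"))  # (7, 9)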
@@ -112,7 +112,7 @@ def act( prev_actions, masks, deterministic=False, - ) -> PolicyAction: + ) -> PolicyActionData: raise NotImplementedError @classmethod @@ -203,7 +203,7 @@ def act( action = distribution.sample() action_log_probs = distribution.log_probs(action) - return PolicyAction( + return PolicyActionData( values=value, actions=value, action_log_probs=action_log_probs, From 4696551ea3d059f1f3f35d68e074ba308219415d Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 10:47:00 -0800 Subject: [PATCH 16/48] Addressing PR comments --- .../habitat_baselines/rl/hrl/hrl_rollout_storage.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index 8cc1730e51..1de4f6b56d 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -118,6 +118,12 @@ def insert( self._last_should_inserts = should_inserts def advance_rollout(self, buffer_index: int = 0): + """ + This will advance to writing at the next step in the data buffer ONLY + if an element was written to that environment index in the previous + step. + """ + self._cur_step_idxs += self._last_should_inserts.long() is_past_buffer = self._cur_step_idxs >= self.num_steps From c11601feb9d5a9d96661fbdbd016005f59e5f999 Mon Sep 17 00:00:00 2001 From: Andrew Szot Date: Fri, 27 Jan 2023 10:48:53 -0800 Subject: [PATCH 17/48] Update habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py Co-authored-by: Vincent-Pierre BERGES <28320361+vincentpierre@users.noreply.github.com> --- habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index c6d6f3fb61..da8c915863 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -112,10 +112,7 @@ def _apply_postcond( action = self._pddl_problem.actions[skill_name] entities = [self._pddl_problem.get_entity(x) for x in skill_args] - if self._pddl_ac_start is None: - raise ValueError( - "Apply post cond not supported when pddl action not in action space" - ) + assert self._pddl_ac_start is not None, "Apply post cond not supported when pddl action not in action space" ac_idx = self._pddl_ac_start found = False From 90155e13c085017eda20a537bd3c8a8669e6312e Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 10:56:24 -0800 Subject: [PATCH 18/48] Addressing PR comments --- .../rl/hrl/hierarchical_policy.py | 2 +- .../rl/hrl/skills/nn_skill.py | 4 +++ .../habitat_baselines/rl/hrl/skills/skill.py | 35 +++++++++++-------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index b1e0973a80..e57a2dc5ef 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -318,7 +318,7 @@ def act( if "rnn_hidden_states" not in hl_info: rnn_hidden_states[batch_ids] *= 0.0 prev_actions[batch_ids] *= 0 - elif self._skills[skill_id].num_recurrent_layers != 0: + elif self._skills[skill_id].has_hidden_state: raise ValueError( f"The code does not currently support neural LL and neural HL 
skills. Skill={self._skills[skill_id]}, HL={self._high_level_policy}" ) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py index 921e96dd2f..12e172e114 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/nn_skill.py @@ -73,6 +73,10 @@ def parameters(self): else: return [] + @property + def has_hidden_state(self): + return self.num_recurrent_layers != 0 + @property def num_recurrent_layers(self): if self._wrap_policy is not None: diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index da8c915863..b912640359 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -2,11 +2,12 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Any, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import gym.spaces as spaces import torch +from habitat.core.simulator import Observations from habitat.tasks.rearrange.rearrange_sensors import IsHoldingSensor from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.hrl.utils import find_action_range @@ -80,6 +81,14 @@ def _get_multi_sensor_index(self, batch_idx: List[int]) -> List[int]: """ return [self._cur_skill_args[i] for i in batch_idx] + @property + def has_hidden_state(self): + """ + Returns if the skill requires a hidden state. + """ + + return False + def _keep_holding_state( self, action_data: PolicyActionData, observations ) -> PolicyActionData: @@ -112,7 +121,9 @@ def _apply_postcond( action = self._pddl_problem.actions[skill_name] entities = [self._pddl_problem.get_entity(x) for x in skill_args] - assert self._pddl_ac_start is not None, "Apply post cond not supported when pddl action not in action space" + assert ( + self._pddl_ac_start is not None + ), "Apply post cond not supported when pddl action not in action space" ac_idx = self._pddl_ac_start found = False @@ -130,7 +141,7 @@ def _apply_postcond( ] if len(entity_idxs) != action.n_args: raise ValueError( - f"Inconsistent # of args {action.n_args} versus {entity_idxs} for {action} with {skill_args} and {entities}" + f"The skill was called with the wrong # of args {action.n_args} versus {entity_idxs} for {action} with {skill_args} and {entities}. Make sure the skill and PDDL definition match." ) actions[idx, ac_idx : ac_idx + action.n_args] = torch.tensor( @@ -144,15 +155,15 @@ def _apply_postcond( def should_terminate( self, - observations, - rnn_hidden_states, - prev_actions, - masks, - actions, - hl_policy_wants_termination, + observations: Observations, + rnn_hidden_states: torch.Tensor, + prev_actions: torch.Tensor, + masks: torch.Tensor, + actions: torch.Tensor, + hl_policy_wants_termination: torch.BoolTensor, batch_idx: List[int], skill_name: List[str], - log_info, + log_info: List[Dict[str, Any]], ) -> Tuple[torch.BoolTensor, torch.BoolTensor, torch.Tensor]: """ :returns: Both of the BoolTensor's will be on the CPU. 
@@ -171,10 +182,6 @@ def should_terminate( ) cur_skill_step = self._cur_skill_step[batch_idx] - bad_terminate = torch.zeros( - cur_skill_step.shape, - dtype=torch.bool, - ) bad_terminate = torch.zeros( cur_skill_step.shape, From 755acb4aa0af9ef62055cf95305d458700b6ea83 Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 27 Jan 2023 22:16:46 -0800 Subject: [PATCH 19/48] Resolved storage problem --- .../habitat_baselines/common/baseline_registry.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/habitat-baselines/habitat_baselines/common/baseline_registry.py b/habitat-baselines/habitat_baselines/common/baseline_registry.py index 17b5697084..26554f6054 100644 --- a/habitat-baselines/habitat_baselines/common/baseline_registry.py +++ b/habitat-baselines/habitat_baselines/common/baseline_registry.py @@ -143,11 +143,7 @@ def register_storage(cls, to_register=None, *, name: Optional[str] = None): trainer and then for fetching data batches for the updater. """ - from habitat_baselines.common.rollout_storage import RolloutStorage - - return cls._register_impl( - "storage", to_register, name, assert_type=RolloutStorage - ) + return cls._register_impl("storage", to_register, name) @classmethod def get_storage(cls, name: str): From 4754fd77c6d640a77a7ccfa6da9f1211bd034619 Mon Sep 17 00:00:00 2001 From: Xavier Puig Date: Sun, 29 Jan 2023 19:46:41 -0800 Subject: [PATCH 20/48] Update oracle_nav.py --- habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index ffcd805da9..be26af308a 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -88,7 +88,7 @@ def from_config( cls, config, observation_space, action_space, batch_size, full_config ): filtered_action_space = ActionSpace( - {config.action_name: action_space[config.nav_action_name]} + {config.action_name: action_space[config.action_name]} ) baselines_logger.debug( f"Loaded action space {filtered_action_space} for skill {config.skill_name}" From 587672eb0f5f72e08c1e751007184711e4ed9b4f Mon Sep 17 00:00:00 2001 From: ASzot Date: Sun, 29 Jan 2023 20:03:39 -0800 Subject: [PATCH 21/48] Fix for agent rotation --- .../habitat_baselines/rl/ppo/policy.py | 9 ++++++-- .../habitat_baselines/rl/ppo/ppo_trainer.py | 2 +- .../rl/ver/inference_worker.py | 22 ++++++++----------- .../habitat_baselines/rl/ver/ver_trainer.py | 3 ++- .../habitat/tasks/rearrange/rearrange_task.py | 3 +++ 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index ecfd261063..9ab6b7c144 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -34,7 +34,12 @@ @dataclass class PolicyActionData: """ - Information returned from the `Policy` class. + Information returned from the `Policy.act` method representing the + information from an agent's action. + + :property should_inserts: Of shape [# envs, 1]. If False at environment + index `i`, then don't write this transition to the rollout buffer. If + `None`, then write all data. 
""" rnn_hidden_states: torch.Tensor @@ -43,7 +48,7 @@ class PolicyActionData: action_log_probs: Optional[torch.Tensor] = None take_actions: Optional[torch.Tensor] = None policy_info: Optional[List[Dict[str, Any]]] = None - should_inserts: Optional[torch.Tensor] = None + should_inserts: Optional[torch.BoolTensor] = None def write_action(self, write_idx, write_action): self.actions[:, write_idx] = write_action diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index 1adbc11c38..114845a9b6 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -206,7 +206,7 @@ def _setup_actor_critic_agent(self, ppo_cfg: "DictConfig") -> None: self.agent = agent_cls.from_config(self.actor_critic, ppo_cfg) self.policy_action_space = self.actor_critic.get_policy_action_space( - self.envs.action_spaces[0] + self.env_action_space ) def _init_envs(self, config=None, is_eval: bool = False): diff --git a/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py b/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py index 56ef3f7025..e4880156d0 100644 --- a/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py +++ b/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py @@ -318,13 +318,7 @@ def step(self) -> Tuple[bool, List[Tuple[int, int]]]: PointNavResNetNet.PRETRAINED_VISUAL_FEATURES_KEY ] = self.visual_encoder(obs) - ( - values, - actions, - actions_log_probs, - next_recurrent_hidden_states, - policy_info, - ) = self.actor_critic.act( + action_data = self.actor_critic.act( obs, recurrent_hidden_states, prev_actions, @@ -333,10 +327,12 @@ def step(self) -> Tuple[bool, List[Tuple[int, int]]]: if not final_batch: self.rollouts.next_hidden_states.index_copy_( - 0, environment_ids, next_recurrent_hidden_states + 0, + environment_ids, + action_data.rnn_hidden_states, ) self.rollouts.next_prev_actions.index_copy_( - 0, environment_ids, actions + 0, environment_ids, action_data.actions ) if self._variable_experience: @@ -348,7 +344,7 @@ def step(self) -> Tuple[bool, List[Tuple[int, int]]]: dtype=np.int64, ) - cpu_actions = actions.to(device="cpu") + cpu_actions = action_data.env_actions.to(device="cpu") self.transfer_buffers["actions"][ self.new_reqs ] = cpu_actions.numpy() @@ -388,8 +384,8 @@ def step(self) -> Tuple[bool, List[Tuple[int, int]]]: dict( masks=to_batch["masks"], observations=obs, - actions=actions, - action_log_probs=actions_log_probs, + actions=action_data.actions, + action_log_probs=action_data.actions_log_probs, recurrent_hidden_states=recurrent_hidden_states, prev_actions=prev_actions, policy_version=self.rollouts.current_policy_version.expand( @@ -398,7 +394,7 @@ def step(self) -> Tuple[bool, List[Tuple[int, int]]]: episode_ids=to_batch["episode_ids"], environment_ids=to_batch["environment_ids"], step_ids=to_batch["step_ids"], - value_preds=values, + value_preds=action_data.values, returns=torch.full( (), float("nan"), diff --git a/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py b/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py index 504bdf71d5..73c50ef4ca 100644 --- a/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ver/ver_trainer.py @@ -181,7 +181,8 @@ def _init_train(self, resume_state): action_space = init_reports[0]["act_space"] self.policy_action_space = action_space - self.orig_policy_action_space = None + self.env_action_space 
= action_space + self.orig_env_action_space = None [ ew.set_action_plugin( diff --git a/habitat-lab/habitat/tasks/rearrange/rearrange_task.py b/habitat-lab/habitat/tasks/rearrange/rearrange_task.py index 1c9d38f2a4..4563bc490f 100644 --- a/habitat-lab/habitat/tasks/rearrange/rearrange_task.py +++ b/habitat-lab/habitat/tasks/rearrange/rearrange_task.py @@ -96,6 +96,9 @@ def __init__( # Duplicate sensors that handle robots. One for each robot. self._duplicate_sensor_suite(self.sensor_suite) + def overwrite_sim_config(self, config: Any, episode: Episode) -> Any: + return config + @property def targ_idx(self): return self._targ_idx From f16bc1473bad2a97c7684aa74de1ec5140b578bd Mon Sep 17 00:00:00 2001 From: ASzot Date: Mon, 30 Jan 2023 13:08:13 -0800 Subject: [PATCH 22/48] Missing key --- habitat-lab/habitat/config/default_structured_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index 69e8dbcd6b..f56dc7eb70 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -153,6 +153,8 @@ class OracleNavActionConfig(ActionConfig): ang_speed: float = 10.0 allow_dyn_slide: bool = True allow_back: bool = True + spawn_max_dist_to_obj: float = 2.0 + num_spawn_attempts: int = 200 # ----------------------------------------------------------------------------- From 22cb83f1190a4b43534295a5adfc2813ff0bcbac Mon Sep 17 00:00:00 2001 From: ASzot Date: Mon, 30 Jan 2023 22:18:13 -0800 Subject: [PATCH 23/48] More docs --- .../defined_skills/oracle_skills.yaml | 3 +++ .../habitat_baselines/rl/hrl/__init__.py | 2 +- .../habitat_baselines/rl/ppo/policy.py | 22 ++++++++++++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml index d3f9c34723..98985a5eec 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml @@ -1,3 +1,6 @@ +# Oracle skills that will teleport to the skill post-condition. When automatically setting predicates you may want to run the simulation in kinematic mode: +# To run in kinematic mode, add: `habitat.simulator.kinematic_mode=True habitat.simulator.ac_freq_ratio=1` + open_cab: skill_name: "NoopSkillPolicy" max_skill_steps: 1 diff --git a/habitat-baselines/habitat_baselines/rl/hrl/__init__.py b/habitat-baselines/habitat_baselines/rl/hrl/__init__.py index 394540d026..39a3f0c90d 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/__init__.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/__init__.py @@ -8,6 +8,6 @@ from habitat_baselines.rl.hrl.hrl_rollout_storage import HrlRolloutStorage __all__ = [ - "HrlPPO", + "HRLPPO", "HrlRolloutStorage", ] diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index 9ab6b7c144..21e621488b 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -40,6 +40,19 @@ class PolicyActionData: :property should_inserts: Of shape [# envs, 1]. 
If False at environment index `i`, then don't write this transition to the rollout buffer. If `None`, then write all data. + :property policy_info: Optional logging information about the policy per + environment. For example, you could log the policy entropy. + :property take_actions: If specified, these actions will be executed in + the environment, but not stored in the storage buffer. This allows + executing and learning from different actions. If not specified, the + agent will execute `self.actions`. + :property values: The actor value predictions. None if the actor does not predict value. + :property actions: The actions to store in the storage buffer. If + `take_actions` is None, then this is also the action executed in the + environment. + :property rnn_hidden_states: Actor hidden states. + :property action_log_probs: The log probabilities of the actions under the + current policy. """ rnn_hidden_states: torch.Tensor @@ -50,11 +63,18 @@ class PolicyActionData: policy_info: Optional[List[Dict[str, Any]]] = None should_inserts: Optional[torch.BoolTensor] = None - def write_action(self, write_idx, write_action): + def write_action(self, write_idx: int, write_action: torch.Tensor) -> None: + """ + Used to override an action across all environments. + """ self.actions[:, write_idx] = write_action @property def env_actions(self) -> torch.Tensor: + """ + The actions to execute in the environment. + """ + if self.take_actions is None: return self.actions else: From df8b7a6c75ec1d663027c877935cf9c1e8a80da7 Mon Sep 17 00:00:00 2001 From: Andrew Szot Date: Mon, 30 Jan 2023 22:20:10 -0800 Subject: [PATCH 24/48] Update habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py Co-authored-by: Vincent-Pierre BERGES <28320361+vincentpierre@users.noreply.github.com> --- .../habitat_baselines/rl/hrl/hrl_rollout_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index 1de4f6b56d..959c041d98 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -58,7 +58,7 @@ def insert( will ignore the sample. If None, this defaults to the last insert state. - Rewards are summed when `should_insert[i] == False`. + Rewards acquired at steps where `should_insert[i] == False` will be summed up and added to the next step where `should_insert[i] == True`. From 422c036f7b971f439b9a2b5d7c77aa008fe50a79 Mon Sep 17 00:00:00 2001 From: Andrew Szot Date: Mon, 30 Jan 2023 22:20:21 -0800 Subject: [PATCH 25/48] Update habitat-baselines/habitat_baselines/rl/hrl/utils.py Co-authored-by: Vincent-Pierre BERGES <28320361+vincentpierre@users.noreply.github.com> --- habitat-baselines/habitat_baselines/rl/hrl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/utils.py b/habitat-baselines/habitat_baselines/rl/hrl/utils.py index d2b56d2f1e..2c1b64bfe2 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/utils.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/utils.py @@ -13,7 +13,7 @@ ) -> Tuple[int, int]: """ Returns the start and end indices of an action key in the action tensor. If - the key is not found. It returns None. + the key is not found, a `ValueError` will be raised.
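A hedged sketch of how the `actions` / `take_actions` split documented above can be used; the minimal dataclass below is a stand-in that mirrors only the fields discussed here, and the tensor shapes are made up for the example.

from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class PolicyActionDataSketch:
    # Stand-in for the real PolicyActionData with just the relevant fields.
    rnn_hidden_states: torch.Tensor
    actions: Optional[torch.Tensor] = None
    take_actions: Optional[torch.Tensor] = None
    should_inserts: Optional[torch.BoolTensor] = None

    @property
    def env_actions(self) -> torch.Tensor:
        # Execute take_actions when provided, otherwise the stored actions.
        return self.actions if self.take_actions is None else self.take_actions

# E.g. a hierarchical policy could store the chosen skill index for learning
# while sending the skill's low-level command to the environment.
skill_idx = torch.tensor([[2]])       # written to the rollout buffer
low_level_cmd = torch.zeros(1, 11)    # executed in the environment
step = PolicyActionDataSketch(
    rnn_hidden_states=torch.zeros(1, 4, 512),
    actions=skill_idx,
    take_actions=low_level_cmd,
    should_inserts=torch.ones(1, 1, dtype=torch.bool),
)
assert step.env_actions is low_level_cmd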
""" start_idx = 0 From 5643b0eb6203ed9f7b91b0f3ec6b75d3b12722c2 Mon Sep 17 00:00:00 2001 From: ASzot Date: Mon, 30 Jan 2023 22:19:52 -0800 Subject: [PATCH 26/48] Updated name --- habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py index 31e742de5c..4acad90588 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py @@ -112,5 +112,5 @@ def reduce_loss(loss): @baseline_registry.register_updater -class HrlDDPPO(DecentralizedDistributedMixin, HRLPPO): +class HRLDDPPO(DecentralizedDistributedMixin, HRLPPO): pass From ebe877eb093364812a84661b9e8745a0c48657a7 Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 31 Jan 2023 22:26:35 -0800 Subject: [PATCH 27/48] fixes for training --- .../habitat_baselines/rl/policy/hl_neural.yaml | 2 +- .../config/rearrange/rl_hierarchical.yaml | 1 + .../habitat_baselines/rl/hrl/hl/neural_policy.py | 1 + .../rl/hrl/hrl_rollout_storage.py | 14 +++++++++----- .../habitat_baselines/rl/ppo/policy.py | 2 +- .../habitat_baselines/rl/ver/inference_worker.py | 2 +- .../habitat/simulator/agents/rgbd_head_agent.yaml | 6 ++++++ 7 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 habitat-lab/habitat/config/habitat/simulator/agents/rgbd_head_agent.yaml diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml index 4246ca0e03..2e7d9a4c87 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml @@ -15,4 +15,4 @@ hierarchical_policy: num_rnn_layers: 2 policy_input_keys: - "robot_head_depth" - defined_skills: MISSING + defined_skills: {} diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml index 70d35f7702..56d78df85a 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -28,6 +28,7 @@ habitat_baselines: force_torch_single_threaded: True eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] load_resume_state_config: False + rollout_storage: "HrlRolloutStorage" eval: use_ckpt_config: False diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py index 3c6def42d2..15933f7354 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py @@ -159,6 +159,7 @@ def evaluate_actions( action_log_probs, distribution_entropy, rnn_hidden_states, + {}, ) def get_next_skill( diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index 959c041d98..26dbbe9afc 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -10,7 +10,7 @@ from habitat_baselines.common.baseline_registry import baseline_registry from habitat_baselines.common.rollout_storage import RolloutStorage -from 
habitat_baselines.common.tensor_dict import DictTree +from habitat_baselines.common.tensor_dict import DictTree, TensorDict from habitat_baselines.rl.models.rnn_state_encoder import ( build_pack_info_from_dones, build_rnn_build_seq_info, @@ -78,8 +78,12 @@ def insert( value_preds=value_preds, ) - next_step = {k: v for k, v in next_step.items() if v is not None} - current_step = {k: v for k, v in current_step.items() if v is not None} + next_step = TensorDict( + {k: v for k, v in next_step.items() if v is not None} + ) + current_step = TensorDict( + {k: v for k, v in current_step.items() if v is not None} + ) if should_inserts is None: should_inserts = self._last_should_inserts @@ -102,7 +106,7 @@ def insert( self._cur_step_idxs[should_inserts] + 1, env_idxs[should_inserts], ), - next_step, + next_step[should_inserts], strict=False, ) @@ -112,7 +116,7 @@ def insert( self._cur_step_idxs[should_inserts], env_idxs[should_inserts], ), - current_step, + current_step[should_inserts], strict=False, ) self._last_should_inserts = should_inserts diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index 21e621488b..e31120c932 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -230,7 +230,7 @@ def act( action_log_probs = distribution.log_probs(action) return PolicyActionData( values=value, - actions=value, + actions=action, action_log_probs=action_log_probs, rnn_hidden_states=rnn_hidden_states, ) diff --git a/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py b/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py index e4880156d0..d8646af772 100644 --- a/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py +++ b/habitat-baselines/habitat_baselines/rl/ver/inference_worker.py @@ -385,7 +385,7 @@ def step(self) -> Tuple[bool, List[Tuple[int, int]]]: masks=to_batch["masks"], observations=obs, actions=action_data.actions, - action_log_probs=action_data.actions_log_probs, + action_log_probs=action_data.action_log_probs, recurrent_hidden_states=recurrent_hidden_states, prev_actions=prev_actions, policy_version=self.rollouts.current_policy_version.expand( diff --git a/habitat-lab/habitat/config/habitat/simulator/agents/rgbd_head_agent.yaml b/habitat-lab/habitat/config/habitat/simulator/agents/rgbd_head_agent.yaml new file mode 100644 index 0000000000..338bbd2dd3 --- /dev/null +++ b/habitat-lab/habitat/config/habitat/simulator/agents/rgbd_head_agent.yaml @@ -0,0 +1,6 @@ +# @package habitat.simulator.agents.rgb_head_agent + +defaults: + - agent_base + - /habitat/simulator/sim_sensors@sim_sensors.head_rgb_sensor: head_rgb_sensor + - /habitat/simulator/sim_sensors@sim_sensors.head_depth_sensor: head_depth_sensor From e1a8727bb72a234b7ebe3680cdee99900d72149c Mon Sep 17 00:00:00 2001 From: ASzot Date: Wed, 1 Feb 2023 16:24:19 -0800 Subject: [PATCH 28/48] Fixed env issue --- .../habitat_baselines/config/rearrange/rl_hierarchical.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml index 56d78df85a..bb980e2713 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -19,7 +19,7 @@ habitat_baselines: torch_gpu_id: 0 video_fps: 30 eval_ckpt_path_dir: "" - 
num_environments: 3 + num_environments: 4 writer_type: 'tb' num_updates: -1 total_num_steps: 1.0e8 From 2673f4fc0feee886d9fd73841a2502f0f4dac6a7 Mon Sep 17 00:00:00 2001 From: ASzot Date: Thu, 2 Feb 2023 12:49:19 -0800 Subject: [PATCH 29/48] Fixed deprecated configs --- .../habitat_baselines/config/rearrange/rl_skill.yaml | 4 ++-- habitat-lab/habitat/config/README.md | 1 - habitat-lab/habitat/config/default_structured_configs.py | 4 ---- .../habitat/config/habitat/task/rearrange/pick_spa.yaml | 4 ---- habitat-lab/habitat/config/habitat/task/rearrange/play.yaml | 3 --- 5 files changed, 2 insertions(+), 14 deletions(-) diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml index 3bfc43f59e..08d7a89c73 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_skill.yaml @@ -18,8 +18,8 @@ habitat_baselines: eval_ckpt_path_dir: "data/new_checkpoints" # 26 environments will just barely be below 16gb. # num_environments: 26 - # 20 environments will just barely be below 11gb. - num_environments: 20 + # 18 environments will just barely be below 11gb. + num_environments: 18 num_updates: -1 total_num_steps: 1.0e8 log_interval: 10 diff --git a/habitat-lab/habitat/config/README.md b/habitat-lab/habitat/config/README.md index 16a1783ab6..3f6dfb4565 100644 --- a/habitat-lab/habitat/config/README.md +++ b/habitat-lab/habitat/config/README.md @@ -241,7 +241,6 @@ habitat: constraint_violation_drops_object: false force_regenerate: false should_save_to_cache: true - must_look_at_targ: true object_in_hand_sample_prob: 0.167 render_target: true ee_sample_factor: 0.2 diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index f56dc7eb70..e7ef02d9ae 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -723,7 +723,6 @@ class TaskConfig(HabitatBaseConfig): force_regenerate: bool = False # Saves the generated starts to a cache if they are not already generated should_save_to_cache: bool = False - must_look_at_targ: bool = True object_in_hand_sample_prob: float = 0.167 min_start_distance: float = 3.0 gfx_replay_dir = "data/replays" @@ -746,9 +745,6 @@ class TaskConfig(HabitatBaseConfig): cache_robot_init: bool = False success_state: float = 0.0 # Measurements for composite tasks. 
- # If true, does not care about navigability or collisions - # with objects when spawning robot - easy_init: bool = False should_enforce_target_within_reach: bool = False # COMPOSITE task CONFIG task_spec_base_path: str = "habitat/task/rearrange/pddl/" diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/pick_spa.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/pick_spa.yaml index f720718123..a9879d7c44 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/pick_spa.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/pick_spa.yaml @@ -36,10 +36,6 @@ base_angle_noise: 0.15 base_noise: 0.05 force_regenerate: False -# If true, does not care about navigability or collisions with objects when spawning -# robot -easy_init: False - actions: arm_action: arm_controller: "ArmAbsPosKinematicAction" diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/play.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/play.yaml index 639e87ecd1..74d4304263 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/play.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/play.yaml @@ -26,9 +26,6 @@ base_angle_noise: 0.0 base_noise: 0.0 constraint_violation_ends_episode: False -# If true, does not care about navigability or collisions -# with objects when spawning robot -easy_init: False force_regenerate: True actions: From dd01d50525d847a6a7bb77acaee60afe066f8b8c Mon Sep 17 00:00:00 2001 From: ASzot Date: Thu, 2 Feb 2023 16:05:47 -0800 Subject: [PATCH 30/48] Speed fix --- .../config/default_structured_configs.py | 1 + .../habitat/tasks/rearrange/rearrange_sim.py | 26 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index c8d4369172..421ecb313e 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -1074,6 +1074,7 @@ class SimulatorConfig(HabitatBaseConfig): create_renderer: bool = False requires_textures: bool = True auto_sleep: bool = False + sleep_dist: float = 3.0 step_physics: bool = True concur_render: bool = False # If markers should be updated at every step: diff --git a/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py b/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py index 3c6bf26661..e6e279fb94 100644 --- a/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py +++ b/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py @@ -143,6 +143,22 @@ def _try_acquire_context(self): if self.habitat_config.concur_render: self.renderer.acquire_gl_context() + def _auto_sleep(self): + all_robo_pos = [ + robot.base_pos for robot in self.robots_mgr.robots_iter + ] + rom = self.get_rigid_object_manager() + for handle, ro in rom.get_objects_by_handle_substring().items(): + is_far = any( + (robo_pos - ro.translation).length() + > self.habitat_config.sleep_dist + for robo_pos in all_robo_pos + ) + if is_far: + ro.motion_type = habitat_sim.physics.MotionType.STATIC + else: + ro.motion_type = self._obj_orig_motion_types[handle] + def sleep_all_objects(self): """ De-activate (sleep) all rigid objects in the scene, assuming they are already in a dynamically stable state. 
@@ -150,6 +166,7 @@ def sleep_all_objects(self): rom = self.get_rigid_object_manager() for _, ro in rom.get_objects_by_handle_substring().items(): ro.awake = False + aom = self.get_articulated_object_manager() for _, ao in aom.get_objects_by_handle_substring().items(): ao.awake = False @@ -242,6 +259,12 @@ def reconfigure(self, config: "DictConfig", ep_info: RearrangeEpisode): if self.habitat_config.auto_sleep: self.sleep_all_objects() + rom = self.get_rigid_object_manager() + self._obj_orig_motion_types = { + handle: ro.motion_type + for handle, ro in rom.get_objects_by_handle_substring().items() + } + if new_scene: self._load_navmesh(ep_info) @@ -662,6 +685,8 @@ def step(self, action: Union[str, int]) -> Observations: self.viz_ids = defaultdict(lambda: None) self.maybe_update_robot() + if self.habitat_config.sleep_dist > 0.0: + self._auto_sleep() if self.habitat_config.concur_render: self._prev_sim_obs = self.start_async_render() @@ -751,7 +776,6 @@ def internal_step( Never call sim.step_world directly or miss updating the robot. """ - # optionally step physics and update the robot for benchmarking purposes if self.habitat_config.step_physics: self.step_world(dt) From b20fcb177e0934ffb5095e3a24e100fd288d3199 Mon Sep 17 00:00:00 2001 From: ASzot Date: Thu, 2 Feb 2023 18:44:30 -0800 Subject: [PATCH 31/48] Updated configs --- .../defined_skills/oracle_skills.yaml | 2 +- habitat-lab/habitat/config/default_structured_configs.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml index 98985a5eec..b7b6357e48 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml @@ -1,5 +1,5 @@ # Oracle skills that will teleport to the skill post-condition. 
When automatically setting predicates you may want to run the simulation in kinematic mode: -# To run in kinematic mode, add: `habitat.simulator.kinematic_mode=True habitat.simulator.ac_freq_ratio=1` +# To run in kinematic mode, add: `habitat.simulator.kinematic_mode=True habitat.simulator.ac_freq_ratio=1 habitat.task.measurements.force_terminate.max_accum_force=-1.0 habitat.task.measurements.force_terminate.max_instant_force=-1.0` open_cab: skill_name: "NoopSkillPolicy" diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index 421ecb313e..9e2862b6fe 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -780,13 +780,6 @@ class CompositeSuccessMeasurementConfig(MeasurementConfig): must_call_stop: bool = True -@dataclass -class CompositeRewardMeasurementConfig(MeasurementConfig): - type: str = "CompositeReward" - must_call_stop: bool = True - success_reward: float = 10.0 - - @dataclass class DoesWantTerminateMeasurementConfig(MeasurementConfig): type: str = "DoesWantTerminate" @@ -1074,7 +1067,7 @@ class SimulatorConfig(HabitatBaseConfig): create_renderer: bool = False requires_textures: bool = True auto_sleep: bool = False - sleep_dist: float = 3.0 + sleep_dist: float = -1.0 step_physics: bool = True concur_render: bool = False # If markers should be updated at every step: From 7982f4059ce7f695f6fc2664580bb5256142750d Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 3 Feb 2023 12:54:13 -0800 Subject: [PATCH 32/48] Pddl action fixes --- .../hierarchical_policy/defined_skills/oracle_skills.yaml | 4 ++++ .../config/rearrange/rl_hierarchical_oracle_nav.yaml | 1 - habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py | 3 +++ .../habitat/config/habitat/task/rearrange/rearrange_easy.yaml | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml index b7b6357e48..27cb7b8060 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml @@ -1,6 +1,10 @@ # Oracle skills that will teleport to the skill post-condition. 
When automatically setting predicates you may want to run the simulation in kinematic mode: # To run in kinematic mode, add: `habitat.simulator.kinematic_mode=True habitat.simulator.ac_freq_ratio=1 habitat.task.measurements.force_terminate.max_accum_force=-1.0 habitat.task.measurements.force_terminate.max_instant_force=-1.0` +defaults: + - /habitat/task/actions: + - pddl_apply_action + open_cab: skill_name: "NoopSkillPolicy" max_skill_steps: 1 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml index 92b80bea64..0f6ddbdbfb 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical_oracle_nav.yaml @@ -9,7 +9,6 @@ defaults: - rl_hierarchical - /habitat/task/actions: - - pddl_apply_action - oracle_nav_action - _self_ diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index b912640359..0ceadb67ee 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -48,6 +48,9 @@ def __init__( ) else: self._pddl_ac_start = None + if self._config.apply_postconds and self._pddl_ac_start is None: + raise ValueError(f"Could not find PDDL action in skill {self}") + self._delay_term: List[Optional[bool]] = [ None for _ in range(self._batch_size) ] diff --git a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml index c138ca2238..e7dff55b48 100644 --- a/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml +++ b/habitat-lab/habitat/config/habitat/task/rearrange/rearrange_easy.yaml @@ -19,6 +19,7 @@ defaults: - did_violate_hold_constraint - move_objects_reward - gfx_replay_measure + - composite_stage_goals - /habitat/task/lab_sensors: - relative_resting_pos_sensor - target_start_sensor From 902bfa145ce3cfb5d00bc973e0cfe97aade00ae0 Mon Sep 17 00:00:00 2001 From: ASzot Date: Fri, 3 Feb 2023 20:14:49 -0800 Subject: [PATCH 33/48] Removed speed opts. Fixed some bugs --- .../config/default_structured_configs.py | 3 + .../defined_skills/oracle_skills.yaml | 10 +- .../rl/policy/hl_neural.yaml | 9 ++ .../rl/hrl/hl/neural_policy.py | 7 ++ .../habitat_baselines/rl/hrl/skills/place.py | 1 - .../habitat_baselines/rl/hrl/skills/skill.py | 32 +++--- .../config/default_structured_configs.py | 1 - .../domain_configs/replica_cad.yaml | 104 ++++++++++++++++++ .../habitat/tasks/rearrange/rearrange_sim.py | 18 --- 9 files changed, 151 insertions(+), 34 deletions(-) diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index 99e4e8d781..6563f3639b 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -251,6 +251,9 @@ class HrlDefinedSkillConfig(HabitatBaselinesBaseConfig): stop_thresh: float = 0.001 # For the reset_arm_skill reset_joint_state: List[float] = MISSING + # The set of PDDL action names (as defined in the PDDL domain file) that + # map to this skill. If not specified,the name of the skill must match the + # PDDL action name. 
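    # For example (mirroring the oracle_skills config updated elsewhere in
    # this patch), one skill can serve several PDDL actions:
    #
    #   nav_to_obj:
    #     pddl_action_names: ["nav", "nav_to_receptacle_by_name"]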
pddl_action_names: Optional[List[str]] = None diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml index 27cb7b8060..ab4bae46ef 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hierarchical_policy/defined_skills/oracle_skills.yaml @@ -9,22 +9,30 @@ open_cab: skill_name: "NoopSkillPolicy" max_skill_steps: 1 apply_postconds: True + force_end_on_timeout: False + pddl_action_names: ["open_cab_by_name"] open_fridge: skill_name: "NoopSkillPolicy" max_skill_steps: 1 apply_postconds: True + force_end_on_timeout: False + pddl_action_names: ["open_fridge_by_name"] close_cab: skill_name: "NoopSkillPolicy" obs_skill_inputs: ["obj_start_sensor"] max_skill_steps: 1 + force_end_on_timeout: False + pddl_action_names: ["close_cab_by_name"] close_fridge: skill_name: "NoopSkillPolicy" obs_skill_inputs: ["obj_start_sensor"] max_skill_steps: 1 apply_postconds: True + force_end_on_timeout: False + pddl_action_names: ["close_fridge_by_name"] pick: skill_name: "NoopSkillPolicy" @@ -51,7 +59,7 @@ nav_to_obj: apply_postconds: True force_end_on_timeout: False obs_skill_input_dim: 2 - pddl_action_names: ["nav", "nav_to_receptacle"] + pddl_action_names: ["nav", "nav_to_receptacle_by_name"] reset_arm: skill_name: "ResetArmSkill" diff --git a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml index 2e7d9a4c87..1ede79538b 100644 --- a/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml +++ b/habitat-baselines/habitat_baselines/config/habitat_baselines/rl/policy/hl_neural.yaml @@ -6,6 +6,15 @@ obs_transforms: hierarchical_policy: high_level_policy: name: "NeuralHighLevelPolicy" + allowed_actions: + - nav + - pick + - place + - nav_to_receptacle_by_name + - open_fridge_by_name + - close_fridge_by_name + - open_cab_by_name + - close_cab_by_name allow_other_place: False hidden_dim: 512 use_rnn: True diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py index 15933f7354..d06f2d7d90 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/neural_policy.py @@ -1,3 +1,4 @@ +import logging from itertools import chain from typing import Any, List @@ -7,6 +8,7 @@ import torch.nn as nn from habitat.tasks.rearrange.multi_task.pddl_action import PddlAction +from habitat_baselines.common.logging import baselines_logger from habitat_baselines.rl.ddppo.policy import resnet from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder from habitat_baselines.rl.hrl.hl.high_level_policy import HighLevelPolicy @@ -81,6 +83,9 @@ def __init__(self, *args, **kwargs): def _setup_actions(self) -> List[PddlAction]: all_actions = self._pddl_prob.get_possible_actions() + all_actions = [ + ac for ac in all_actions if ac.name in self._config.allowed_actions + ] if not self._config.allow_other_place: all_actions = [ ac @@ -191,6 +196,8 @@ def get_next_skill( if should_plan != 1.0: continue use_ac = self._all_actions[skill_sel[batch_idx]] + if 
baselines_logger.level >= logging.DEBUG: + baselines_logger.debug(f"HL Policy selected skill {use_ac}") next_skill[batch_idx] = self._skill_name_to_idx[use_ac.name] skill_args_data[batch_idx] = [ entity.name for entity in use_ac.param_values diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/place.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/place.py index a8d069910c..a99ac1a8e1 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/place.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/place.py @@ -46,7 +46,6 @@ def _is_skill_done( if is_done.sum() > 0: self._internal_log( f"Terminating with {rel_resting_pos} and {is_holding}", - observations, ) return is_done diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 0ceadb67ee..6bc7777806 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -2,6 +2,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import logging from typing import Any, Dict, List, Optional, Tuple import gym.spaces as spaces @@ -71,7 +72,7 @@ def __init__( action_space, "rearrange_stop" ) - def _internal_log(self, s, observations=None): + def _internal_log(self, s): baselines_logger.debug( f"Skill {self._config.skill_name} @ step {self._cur_skill_step}: {s}" ) @@ -181,7 +182,6 @@ def should_terminate( if is_skill_done.sum() > 0: self._internal_log( f"Requested skill termination {is_skill_done}", - observations, ) cur_skill_step = self._cur_skill_step[batch_idx] @@ -192,33 +192,35 @@ def should_terminate( dtype=torch.bool, ) if self._config.max_skill_steps > 0: - over_max_len = cur_skill_step > self._config.max_skill_steps + over_max_len = cur_skill_step >= self._config.max_skill_steps if self._config.force_end_on_timeout: bad_terminate = over_max_len else: is_skill_done = is_skill_done | over_max_len + + is_skill_done |= hl_policy_wants_termination + new_actions = torch.zeros_like(actions) for i, env_i in enumerate(batch_idx): if self._delay_term[env_i]: + self._internal_log( + "Terminating skill due to delayed termination." + ) self._delay_term[env_i] = False - is_skill_done[i] = 1.0 - elif ( - self._config.apply_postconds - and is_skill_done[i] == 1.0 - and hl_policy_wants_termination[i] == 0.0 - ): + is_skill_done[i] = True + elif self._config.apply_postconds and is_skill_done[i]: new_actions[i] = self._apply_postcond( actions, log_info, skill_name[i], env_i, i ) self._delay_term[env_i] = True - is_skill_done[i] = 0.0 - - is_skill_done |= hl_policy_wants_termination + is_skill_done[i] = False + self._internal_log( + "Applying PDDL action and terminating on the next step." 
+ ) if bad_terminate.sum() > 0: self._internal_log( f"Bad terminating due to timeout {cur_skill_step}, {bad_terminate}", - observations, ) return is_skill_done, bad_terminate, new_actions @@ -238,6 +240,10 @@ def on_enter( self._cur_skill_step[batch_idxs] = 0 for i, batch_idx in enumerate(batch_idxs): self._raw_skill_args[batch_idx] = skill_arg[i] + if baselines_logger.level >= logging.DEBUG: + baselines_logger.debug( + f"Entering skill {self} with arguments {skill_arg[i]}" + ) self._cur_skill_args[batch_idx] = self._parse_skill_arg( skill_arg[i] ) diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index 9e2862b6fe..8940845ba4 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -1067,7 +1067,6 @@ class SimulatorConfig(HabitatBaseConfig): create_renderer: bool = False requires_textures: bool = True auto_sleep: bool = False - sleep_dist: float = -1.0 step_physics: bool = True concur_render: bool = False # If markers should be updated at every step: diff --git a/habitat-lab/habitat/tasks/rearrange/multi_task/domain_configs/replica_cad.yaml b/habitat-lab/habitat/tasks/rearrange/multi_task/domain_configs/replica_cad.yaml index 2de18d0c9b..20a15150fa 100644 --- a/habitat-lab/habitat/tasks/rearrange/multi_task/domain_configs/replica_cad.yaml +++ b/habitat-lab/habitat/tasks/rearrange/multi_task/domain_configs/replica_cad.yaml @@ -310,3 +310,107 @@ actions: config_args: task.base_angle_noise: 0.0 task.spawn_region_scale: 0.0 + + ######################################################### + # Receptacle name only based variants of the receptacle skills. This does not + # require any information about knowing which objects the receptacle + # contains. 
+ - name: nav_to_receptacle_by_name + parameters: + - name: marker + expr_type: art_obj_type + - name: robot + expr_type: robot_type + precondition: null + postcondition: + - robot_at(marker, robot) + task_info: + task: NavToObjTask-v0 + task_def: "nav_to_obj" + config_args: + task.force_regenerate: True + task.should_save_to_cache: False + + - name: open_fridge_by_name + parameters: + - name: fridge_id + expr_type: fridge_type + - name: robot + expr_type: robot_type + precondition: + expr_type: AND + sub_exprs: + - robot_at(fridge_id, robot) + - closed_fridge(fridge_id) + postcondition: + - opened_fridge(fridge_id) + task_info: + task_def: "open_fridge" + task: RearrangeOpenFridgeTask-v0 + add_task_args: + marker: fridge_id + config_args: + task.base_angle_noise: 0.0 + task.spawn_region_scale: 0.0 + + - name: close_fridge_by_name + parameters: + - name: fridge_id + expr_type: fridge_type + - name: robot + expr_type: robot_type + precondition: + expr_type: AND + sub_exprs: + - robot_at(fridge_id, robot) + - opened_fridge(fridge_id) + postcondition: + - closed_fridge(fridge_id) + task_info: + task_def: "close_fridge" + task: RearrangeCloseFridgeTask-v0 + add_task_args: + marker: fridge_id + config_args: + task.base_angle_noise: 0.0 + task.spawn_region_scale: 0.0 + + - name: open_cab_by_name + parameters: + - name: marker + expr_type: cab_type + - name: robot + expr_type: robot_type + precondition: + expr_type: AND + sub_exprs: + - robot_at(marker, robot) + - closed_cab(marker) + postcondition: + - opened_cab(marker) + task_info: + task_def: "open_cab" + task: RearrangeOpenDrawerTask-v0 + config_args: + task.base_angle_noise: 0.0 + task.spawn_region_scale: 0.0 + + - name: close_cab_by_name + parameters: + - name: marker + expr_type: cab_type + - name: robot + expr_type: robot_type + precondition: + expr_type: AND + sub_exprs: + - robot_at(marker, robot) + - opened_cab(marker) + postcondition: + - closed_cab(marker) + task_info: + task_def: "close_cab" + task: RearrangeCloseDrawerTask-v0 + config_args: + task.base_angle_noise: 0.0 + task.spawn_region_scale: 0.0 diff --git a/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py b/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py index e6e279fb94..b9bd60daee 100644 --- a/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py +++ b/habitat-lab/habitat/tasks/rearrange/rearrange_sim.py @@ -143,22 +143,6 @@ def _try_acquire_context(self): if self.habitat_config.concur_render: self.renderer.acquire_gl_context() - def _auto_sleep(self): - all_robo_pos = [ - robot.base_pos for robot in self.robots_mgr.robots_iter - ] - rom = self.get_rigid_object_manager() - for handle, ro in rom.get_objects_by_handle_substring().items(): - is_far = any( - (robo_pos - ro.translation).length() - > self.habitat_config.sleep_dist - for robo_pos in all_robo_pos - ) - if is_far: - ro.motion_type = habitat_sim.physics.MotionType.STATIC - else: - ro.motion_type = self._obj_orig_motion_types[handle] - def sleep_all_objects(self): """ De-activate (sleep) all rigid objects in the scene, assuming they are already in a dynamically stable state. 
@@ -685,8 +669,6 @@ def step(self, action: Union[str, int]) -> Observations: self.viz_ids = defaultdict(lambda: None) self.maybe_update_robot() - if self.habitat_config.sleep_dist > 0.0: - self._auto_sleep() if self.habitat_config.concur_render: self._prev_sim_obs = self.start_async_render() From 74278bde54f476188039215dffd5bcc8d48504d3 Mon Sep 17 00:00:00 2001 From: ASzot Date: Sat, 4 Feb 2023 11:13:09 -0800 Subject: [PATCH 34/48] Fixed rendering text to the frame --- .../habitat_baselines/config/rearrange/rl_hierarchical.yaml | 4 ++-- habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py | 4 +--- habitat-lab/habitat/utils/render_wrapper.py | 4 ++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml index bb980e2713..611a64c2ec 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -22,9 +22,9 @@ habitat_baselines: num_environments: 4 writer_type: 'tb' num_updates: -1 - total_num_steps: 1.0e8 + total_num_steps: 5.0e7 log_interval: 10 - num_checkpoints: 20 + num_checkpoints: 10 force_torch_single_threaded: True eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] load_resume_state_config: False diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index cf4dbcfe92..eb5577a917 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -1103,9 +1103,7 @@ def _eval_checkpoint( frame = observations_to_image( {k: v[i] * 0.0 for k, v in batch.items()}, infos[i] ) - frame = overlay_frame( - frame, extract_scalars_from_info(infos[i]) - ) + frame = overlay_frame(frame, infos[i]) rgb_frames[i].append(frame) # episode ended diff --git a/habitat-lab/habitat/utils/render_wrapper.py b/habitat-lab/habitat/utils/render_wrapper.py index 8000093d99..63ad257b6c 100644 --- a/habitat-lab/habitat/utils/render_wrapper.py +++ b/habitat-lab/habitat/utils/render_wrapper.py @@ -66,6 +66,10 @@ def append_text_to_image( def overlay_frame(frame, info, additional=None): + """ + Renders text from the `info` dictionary to the `frame` image. + """ + lines = [] flattened_info = flatten_dict(info) for k, v in flattened_info.items(): From 49a71a42064d39a36a40a5c981a635714fc390ff Mon Sep 17 00:00:00 2001 From: ASzot Date: Sat, 4 Feb 2023 11:27:17 -0800 Subject: [PATCH 35/48] Addressing Vince's PR comments --- habitat-baselines/habitat_baselines/README.md | 13 +++++++------ .../config/rearrange/rl_rearrange.yaml | 2 +- ...4193555.s-164-67-210-79.resnet.ucla.edu.58058.0 | Bin 40 -> 0 bytes 3 files changed, 8 insertions(+), 7 deletions(-) delete mode 100644 habitat-baselines/tb/events.out.tfevents.1674193555.s-164-67-210-79.resnet.ucla.edu.58058.0 diff --git a/habitat-baselines/habitat_baselines/README.md b/habitat-baselines/habitat_baselines/README.md index bb0319da10..86cfd34ab7 100644 --- a/habitat-baselines/habitat_baselines/README.md +++ b/habitat-baselines/habitat_baselines/README.md @@ -62,17 +62,18 @@ To run the following examples, you need the [ReplicaCAD dataset](https://github. 
To train a high-level policy, while using pre-learned low-level skills (SRL baseline from [Habitat2.0](https://arxiv.org/abs/2106.14405)), you can run: ```bash -python -u habitat_baselines/run.py \ - --exp-config habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml \ +python -u habitat-baselines/habitat_baselines/run.py \ + --exp-config habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml \ --run-type train ``` -To run a rearrangement episode with oracle in both low- and high-level, you can run: +To run a rearrangement episode with oracle low-level skills and a fixed task planner, run: ```bash -python -u habitat_baselines/run.py \ - --exp-config habitat_baselines/config/rearrange/rl_hl_srl_onav.yaml \ +python -u habitat-baselines/habitat_baselines/run.py \ + --exp-config habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml \ --run-type eval \ - habitat_baselines/rl/policy=hierarchical_tp_noop_onav + habitat_baselines/rl/policy=hl_fixed \ + habitat_baselines/rl/policy/hierarchical_policy/defined_skills=oracle_skills ``` To change the task (like set table) that you train your skills on, you can change the line `/habitat/task/rearrange: rearrange_easy` to `/habitat/task/rearrange: set_table` in the defaults of your config. diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml index d68be219bb..30dfdcd50e 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_rearrange.yaml @@ -2,8 +2,8 @@ defaults: - /benchmark/rearrange: rearrange - - /habitat_baselines/rl/policy: monolithic - /habitat_baselines: habitat_baselines_rl_config_base + - /habitat_baselines/rl/policy: monolithic - /habitat/simulator/sim_sensors@habitat_baselines.eval.extra_sim_sensors.third_rgb_sensor: third_rgb_sensor - _self_ diff --git a/habitat-baselines/tb/events.out.tfevents.1674193555.s-164-67-210-79.resnet.ucla.edu.58058.0 b/habitat-baselines/tb/events.out.tfevents.1674193555.s-164-67-210-79.resnet.ucla.edu.58058.0 deleted file mode 100644 index 11853e6cc35c6fa02416a8cca40abb3d004e18ad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 rcmb1OfPlsI-b$P&Paay*_34JA6mL>dVrHJ6YguYuiIvgv2lx8`_;n8f From b41133ad1db34fd4b88de15a93cce5daab88387a Mon Sep 17 00:00:00 2001 From: ASzot Date: Sat, 4 Feb 2023 15:25:51 -0800 Subject: [PATCH 36/48] Refactored navigation to be much clearer --- .../rl/hrl/skills/oracle_nav.py | 54 +++++-------------- .../rearrange/actions/oracle_nav_action.py | 45 ++++++++-------- test/test_baseline_training.py | 1 + 3 files changed, 38 insertions(+), 62 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index be26af308a..fe1364ae51 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -8,11 +8,7 @@ import torch from habitat.core.spaces import ActionSpace -from habitat.tasks.rearrange.actions.oracle_nav_action import ( - get_possible_nav_to_actions, -) from habitat.tasks.rearrange.multi_task.pddl_domain import PddlProblem -from habitat.tasks.rearrange.multi_task.rearrange_pddl import RIGID_OBJ_TYPE from habitat.tasks.rearrange.rearrange_sensors import LocalizationSensor from habitat_baselines.common.logging import baselines_logger from 
habitat_baselines.rl.hrl.skills.nn_skill import NnSkillPolicy @@ -23,9 +19,11 @@ class OracleNavPolicy(NnSkillPolicy): @dataclass class OracleNavActionArgs: + """ + :property action_idx: The index of the oracle action we want to execute + """ + action_idx: int - is_target_obj: bool - target_idx: int def __init__( self, @@ -53,7 +51,7 @@ def __init__( pddl_task_path, task_config, ) - self._poss_actions = get_possible_nav_to_actions(self._pddl_problem) + self._all_entities = self._pddl_problem.get_ordered_entities_list() self._oracle_nav_ac_idx, _ = find_action_range( action_space, "oracle_nav_action" ) @@ -130,47 +128,23 @@ def _is_skill_done( return ret def _parse_skill_arg(self, skill_arg): - marker = None if len(skill_arg) == 2: - targ_obj, _ = skill_arg + search_target, _ = skill_arg elif len(skill_arg) == 3: - marker, targ_obj, _ = skill_arg + _, search_target, _ = skill_arg else: raise ValueError( f"Unexpected number of skill arguments in {skill_arg}" ) - targ_obj_idx = int(targ_obj.split("|")[-1]) - - targ_obj = self._pddl_problem.get_entity(targ_obj) - if marker is not None: - marker = self._pddl_problem.get_entity(marker) - - match_i = None - for i, action in enumerate(self._poss_actions): - match_obj = action.get_arg_value("obj") - if marker is not None: - match_marker = action.get_arg_value("marker") - if match_marker != marker: - continue - if match_obj != targ_obj: - continue - match_i = i - break - if match_i is None: - raise ValueError(f"Cannot find matching action for {skill_arg}") - is_target_obj = targ_obj.expr_type.is_subtype_of( - self._pddl_problem.expr_types[RIGID_OBJ_TYPE] - ) - return OracleNavPolicy.OracleNavActionArgs( - match_i, is_target_obj, targ_obj_idx - ) - - def _get_multi_sensor_index(self, batch_idx): - return [self._cur_skill_args[i].target_idx for i in batch_idx] + target = self._pddl_problem.get_entity(search_target) + if target is None: + raise ValueError( + f"Cannot find matching entity for {search_target}" + ) + match_i = self._all_entities.index(target) - def requires_rnn_state(self): - return False + return OracleNavPolicy.OracleNavActionArgs(match_i) def _internal_act( self, diff --git a/habitat-lab/habitat/tasks/rearrange/actions/oracle_nav_action.py b/habitat-lab/habitat/tasks/rearrange/actions/oracle_nav_action.py index 19523e04c1..521aa5a5b7 100644 --- a/habitat-lab/habitat/tasks/rearrange/actions/oracle_nav_action.py +++ b/habitat-lab/habitat/tasks/rearrange/actions/oracle_nav_action.py @@ -14,32 +14,34 @@ from habitat.tasks.utils import get_angle -def compute_turn(rel, turn_vel, robot_forward): - is_left = np.cross(robot_forward, rel) > 0 - if is_left: - vel = [0, -turn_vel] - else: - vel = [0, turn_vel] - return vel - - -def get_possible_nav_to_actions(pddl_problem): - return pddl_problem.get_possible_actions( - allowed_action_names=["nav", "nav_to_receptacle"], - true_preds=None, - ) - - @registry.register_task_action class OracleNavAction(BaseVelAction): + """ + An action that will convert the index of an entity (in the sense of + `PddlEntity`) to navigate to and convert this to base control to move the + robot to the closest navigable position to that entity. The entity index is + the index into the list of all available entities in the current scene. 
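The steering logic this action applies further down reduces to the sign of a planar cross product between the robot's forward vector and the vector toward the target; a small self-contained check of that rule (the vectors here are made up for illustration):

```python
def turn_command(rel, turn_vel, robot_forward):
    # Positive planar cross product => target is to the left of the robot.
    is_left = robot_forward[0] * rel[1] - robot_forward[1] * rel[0] > 0
    return [0.0, -turn_vel] if is_left else [0.0, turn_vel]

forward = [1.0, 0.0]
print(turn_command([0.0, 1.0], 1.0, forward))   # target to the left  -> [0.0, -1.0]
print(turn_command([0.0, -1.0], 1.0, forward))  # target to the right -> [0.0, 1.0]
```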
+ """ + def __init__(self, *args, task, **kwargs): super().__init__(*args, **kwargs) self._task = task - self._poss_actions = get_possible_nav_to_actions(task.pddl_problem) + self._poss_entities = ( + self._task.pddl_problem.get_ordered_entities_list() + ) self._prev_ep_id = None self._targets = {} + @staticmethod + def _compute_turn(rel, turn_vel, robot_forward): + is_left = np.cross(robot_forward, rel) > 0 + if is_left: + vel = [0, -turn_vel] + else: + vel = [0, turn_vel] + return vel + @property def action_space(self): return spaces.Dict( @@ -62,8 +64,7 @@ def reset(self, *args, **kwargs): def _get_target_for_idx(self, nav_to_target_idx: int): if nav_to_target_idx not in self._targets: - action = self._poss_actions[nav_to_target_idx] - nav_to_obj = action.get_arg_value("obj") + nav_to_obj = self._poss_entities[nav_to_target_idx] obj_pos = self._task.pddl_problem.sim_info.get_entity_pos( nav_to_obj ) @@ -95,7 +96,7 @@ def step(self, *args, is_last_action, **kwargs): self._action_arg_prefix + "oracle_nav_action" ] if nav_to_target_idx <= 0 or nav_to_target_idx > len( - self._poss_actions + self._poss_entities ): if is_last_action: return self._sim.step(HabitatSimActions.base_velocity) @@ -134,7 +135,7 @@ def step(self, *args, is_last_action, **kwargs): if not at_goal: if dist_to_final_nav_targ < self._config.dist_thresh: # Look at the object - vel = compute_turn( + vel = OracleNavAction._compute_turn( rel_pos, self._config.turn_velocity, robot_forward ) elif angle_to_target < self._config.turn_thresh: @@ -142,7 +143,7 @@ def step(self, *args, is_last_action, **kwargs): vel = [self._config.forward_velocity, 0] else: # Look at the target waypoint. - vel = compute_turn( + vel = OracleNavAction._compute_turn( rel_targ, self._config.turn_velocity, robot_forward ) else: diff --git a/test/test_baseline_training.py b/test/test_baseline_training.py index d377990317..0b9b7b08b4 100644 --- a/test/test_baseline_training.py +++ b/test/test_baseline_training.py @@ -174,6 +174,7 @@ def test_trainers(config_path, num_updates, overrides, trainer_name): ), ) def test_hrl(config_path, policy_type, skill_type, mode): + if policy_type == "hl_neural" and skill_type == "nn_skills": return if policy_type == "hl_fixed" and mode == "train": From e7a877b3c31d2b9268c07a2e2117e0e09651852a Mon Sep 17 00:00:00 2001 From: ASzot Date: Sun, 5 Feb 2023 13:12:50 -0800 Subject: [PATCH 37/48] Fixed some of the tests --- .../habitat_baselines/agents/ppo_agents.py | 12 ++++-------- .../sims/habitat_simulator/habitat_simulator.py | 4 ++-- habitat-lab/habitat/tasks/nav/nav.py | 4 +--- test/test_baseline_config.py | 5 +++++ 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/habitat-baselines/habitat_baselines/agents/ppo_agents.py b/habitat-baselines/habitat_baselines/agents/ppo_agents.py index 7b25704cf2..f87a85ef3e 100644 --- a/habitat-baselines/habitat_baselines/agents/ppo_agents.py +++ b/habitat-baselines/habitat_baselines/agents/ppo_agents.py @@ -126,23 +126,19 @@ def reset(self) -> None: def act(self, observations: Observations) -> Dict[str, int]: batch = batch_obs([observations], device=self.device) with torch.no_grad(): - ( - _, - actions, - _, - self.test_recurrent_hidden_states, - ) = self.actor_critic.act( + action_data = self.actor_critic.act( batch, self.test_recurrent_hidden_states, self.prev_actions, self.not_done_masks, deterministic=False, ) + self.test_recurrent_hidden_states = action_data.rnn_hidden_states # Make masks not done till reset (end of episode) will be called 
self.not_done_masks.fill_(True) - self.prev_actions.copy_(actions) # type: ignore + self.prev_actions.copy_(action_data.actions) # type: ignore - return {"action": actions[0][0].item()} + return {"action": action_data.env_actions[0][0].item()} def main(): diff --git a/habitat-lab/habitat/sims/habitat_simulator/habitat_simulator.py b/habitat-lab/habitat/sims/habitat_simulator/habitat_simulator.py index d09ef21c42..027e4e82f4 100644 --- a/habitat-lab/habitat/sims/habitat_simulator/habitat_simulator.py +++ b/habitat-lab/habitat/sims/habitat_simulator/habitat_simulator.py @@ -390,8 +390,8 @@ def _update_agents_state(self) -> bool: agent_cfg = self.habitat_config.agents[agent_name] if agent_cfg.is_set_start_state: self.set_agent_state( - agent_cfg.start_position, - agent_cfg.start_rotation, + [float(k) for k in agent_cfg.start_position], + [float(k) for k in agent_cfg.start_rotation], agent_id, ) is_updated = True diff --git a/habitat-lab/habitat/tasks/nav/nav.py b/habitat-lab/habitat/tasks/nav/nav.py index f4bcc4c572..c844c5a7f4 100644 --- a/habitat-lab/habitat/tasks/nav/nav.py +++ b/habitat-lab/habitat/tasks/nav/nav.py @@ -1317,9 +1317,7 @@ def overwrite_sim_config(self, config: Any, episode: Episode) -> Any: ): agent_config = get_agent_config(config.simulator) agent_config.start_position = episode.start_position - agent_config.start_rotation = [ - float(k) for k in episode.start_rotation - ] + agent_config.start_rotation = episode.start_rotation agent_config.is_set_start_state = True return config diff --git a/test/test_baseline_config.py b/test/test_baseline_config.py index 533944bd78..15b3fb524b 100644 --- a/test/test_baseline_config.py +++ b/test/test_baseline_config.py @@ -25,4 +25,9 @@ def test_baselines_configs(test_cfg_path): cleaned_path = test_cfg_path.replace( "habitat-baselines/habitat_baselines/config/", "" ) + if "habitat_baselines" in cleaned_path: + # Do not test non-standalone config options that are + # supposed to be used with "main" configs. + return + get_config(cleaned_path) From 5c213e3062b8c06a42df384e33de50f4484e9659 Mon Sep 17 00:00:00 2001 From: ASzot Date: Sun, 5 Feb 2023 21:41:20 -0800 Subject: [PATCH 38/48] Adddressed PR comments --- .../habitat_baselines/rl/hrl/hierarchical_policy.py | 4 ++-- .../habitat_baselines/rl/hrl/skills/reset.py | 12 +++++++----- .../habitat_baselines/rl/hrl/skills/skill.py | 4 ++-- .../habitat_baselines/rl/ppo/ppo_trainer.py | 5 +++++ test/test_baseline_training.py | 4 ++++ 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py index e57a2dc5ef..bcad61b862 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hierarchical_policy.py @@ -227,7 +227,7 @@ def act( (self._num_envs,), dtype=torch.bool ) - hl_policy_wants_termination = self._high_level_policy.get_termination( + hl_wants_skill_term = self._high_level_policy.get_termination( observations, rnn_hidden_states, prev_actions, @@ -249,7 +249,7 @@ def act( "prev_actions": prev_actions, "masks": masks, "actions": actions, - "hl_policy_wants_termination": hl_policy_wants_termination, + "hl_wants_skill_term": hl_wants_skill_term, }, # Only decide on skill termination if the episode is active. 
should_adds=masks, diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py index 39bf4ea058..937f0097c1 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/reset.py @@ -21,10 +21,12 @@ def __init__( batch_size, ): super().__init__(config, action_space, batch_size, True) - self._target = np.array([float(x) for x in config.reset_joint_state]) + self._rest_state = np.array( + [float(x) for x in config.reset_joint_state] + ) self._arm_ac_range = find_action_range(action_space, "arm_action") - self._arm_ac_range = (self._arm_ac_range[0], self._target.shape[0]) + self._arm_ac_range = (self._arm_ac_range[0], self._rest_state.shape[0]) def on_enter( self, @@ -43,7 +45,7 @@ def on_enter( ) self._initial_delta = ( - self._target - observations["joint"].cpu().numpy() + self._rest_state - observations["joint"].cpu().numpy() ) return ret @@ -58,7 +60,7 @@ def _is_skill_done( return ( torch.as_tensor( - np.abs(current_joint_pos - self._target).max(-1), + np.abs(current_joint_pos - self._rest_state).max(-1), dtype=torch.float32, ) < 5e-2 @@ -74,7 +76,7 @@ def _internal_act( deterministic=False, ): current_joint_pos = observations["joint"].cpu().numpy() - delta = self._target - current_joint_pos + delta = self._rest_state - current_joint_pos # Dividing by max initial delta means that the action will # always in [-1,1] and has the benefit of reducing the delta diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 6bc7777806..b1a8f87cd4 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -164,7 +164,7 @@ def should_terminate( prev_actions: torch.Tensor, masks: torch.Tensor, actions: torch.Tensor, - hl_policy_wants_termination: torch.BoolTensor, + hl_wants_skill_term: torch.BoolTensor, batch_idx: List[int], skill_name: List[str], log_info: List[Dict[str, Any]], @@ -198,7 +198,7 @@ def should_terminate( else: is_skill_done = is_skill_done | over_max_len - is_skill_done |= hl_policy_wants_termination + is_skill_done |= hl_wants_skill_term new_actions = torch.zeros_like(actions) for i, env_i in enumerate(batch_idx): diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index eb5577a917..708e20f802 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -84,6 +84,11 @@ class PPOTrainer(BaseRLTrainer): r"""Trainer class for PPO algorithm Paper: https://arxiv.org/abs/1707.06347. + + :property env_action_space: The action space required by the environment. + :property policy_action_space: The action space the policy acts in. This + can be different from the environment action space for hierarchical + policies. 
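As a toy illustration of that distinction (the spaces below are invented for the example, not taken from the configs): the environment may consume a dictionary of continuous commands while a hierarchical policy's own action space can be as small as a skill index.

```python
from gym import spaces

# What the environment consumes at every step.
env_action_space = spaces.Dict(
    {
        "arm_action": spaces.Box(-1.0, 1.0, (7,)),
        "base_velocity": spaces.Box(-1.0, 1.0, (2,)),
    }
)
# What a high-level policy might act in: one of N skills.
policy_action_space = spaces.Discrete(5)
```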
""" supported_tasks = ["Nav-v0"] diff --git a/test/test_baseline_training.py b/test/test_baseline_training.py index 0b9b7b08b4..aa1fdeccd9 100644 --- a/test/test_baseline_training.py +++ b/test/test_baseline_training.py @@ -174,6 +174,7 @@ def test_trainers(config_path, num_updates, overrides, trainer_name): ), ) def test_hrl(config_path, policy_type, skill_type, mode): + TRAIN_LOG_FILE = "data/test_train.log" if policy_type == "hl_neural" and skill_type == "nn_skills": return @@ -184,6 +185,8 @@ def test_hrl(config_path, policy_type, skill_type, mode): # Remove the checkpoints from previous tests for f in glob.glob("data/test_checkpoints/test_training/*"): os.remove(f) + if os.path.exists(TRAIN_LOG_FILE): + os.remove(TRAIN_LOG_FILE) # Setup the training config = get_config( @@ -195,6 +198,7 @@ def test_hrl(config_path, policy_type, skill_type, mode): "habitat_baselines.total_num_steps=-1.0", "habitat_baselines.test_episode_count=1", "habitat_baselines.checkpoint_folder=data/test_checkpoints/test_training", + f"habitat_baselines.log_file={TRAIN_LOG_FILE}", f"habitat_baselines/rl/policy={policy_type}", f"habitat_baselines/rl/policy/hierarchical_policy/defined_skills={skill_type}", ], From d9721f19ee6b464db16ca8ef4190b873a6c70e03 Mon Sep 17 00:00:00 2001 From: ASzot Date: Sun, 5 Feb 2023 21:48:01 -0800 Subject: [PATCH 39/48] Fixed rotation issue --- habitat-lab/habitat/tasks/nav/nav.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/habitat-lab/habitat/tasks/nav/nav.py b/habitat-lab/habitat/tasks/nav/nav.py index c844c5a7f4..f4bcc4c572 100644 --- a/habitat-lab/habitat/tasks/nav/nav.py +++ b/habitat-lab/habitat/tasks/nav/nav.py @@ -1317,7 +1317,9 @@ def overwrite_sim_config(self, config: Any, episode: Episode) -> Any: ): agent_config = get_agent_config(config.simulator) agent_config.start_position = episode.start_position - agent_config.start_rotation = episode.start_rotation + agent_config.start_rotation = [ + float(k) for k in episode.start_rotation + ] agent_config.is_set_start_state = True return config From f8387defbd5ea590a7557acbb79ace241a8b2aa4 Mon Sep 17 00:00:00 2001 From: ASzot Date: Sun, 5 Feb 2023 21:52:01 -0800 Subject: [PATCH 40/48] Fixed black --- .../habitat_baselines/common/rollout_storage.py | 1 - habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py | 7 ++++++- habitat-baselines/habitat_baselines/rl/ppo/ppo.py | 1 - 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/habitat-baselines/habitat_baselines/common/rollout_storage.py b/habitat-baselines/habitat_baselines/common/rollout_storage.py index a146d9eb4a..8120b56494 100644 --- a/habitat-baselines/habitat_baselines/common/rollout_storage.py +++ b/habitat-baselines/habitat_baselines/common/rollout_storage.py @@ -36,7 +36,6 @@ def __init__( num_recurrent_layers=1, is_double_buffered: bool = False, ): - if is_continuous_action_space(action_space): # Assume ALL actions are NOT discrete action_shape = ( diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py index 4acad90588..998933c459 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py @@ -28,7 +28,12 @@ def reduce_loss(loss): self._set_grads_to_none() - (values, action_log_probs, dist_entropy, _,) = self._evaluate_actions( + ( + values, + action_log_probs, + dist_entropy, + _, + ) = self._evaluate_actions( batch["observations"], batch["recurrent_hidden_states"], batch["prev_actions"], diff --git 
a/habitat-baselines/habitat_baselines/rl/ppo/ppo.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo.py index b2d764ca6b..7affdc63ba 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo.py @@ -296,7 +296,6 @@ def update( self, rollouts: RolloutStorage, ) -> Dict[str, float]: - advantages = self.get_advantages(rollouts) learner_metrics: Dict[str, List[Any]] = collections.defaultdict(list) From 1c8f54cd0fb047b46255bdbfc498f7941f87ee7c Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 7 Feb 2023 17:45:57 -0800 Subject: [PATCH 41/48] Addressed PR comments --- .../habitat_baselines/common/rollout_storage.py | 15 ++------------- .../config/default_structured_configs.py | 2 +- .../config/rearrange/rl_hierarchical.yaml | 4 +++- .../rl/hrl/hl/high_level_policy.py | 8 ++++++++ .../habitat_baselines/rl/hrl/skills/skill.py | 13 ++++++++----- .../habitat_baselines/rl/ppo/policy.py | 2 ++ .../habitat_baselines/rl/ppo/ppo_trainer.py | 14 +++++--------- .../habitat_baselines/utils/common.py | 17 +++++++++++++++++ 8 files changed, 46 insertions(+), 29 deletions(-) diff --git a/habitat-baselines/habitat_baselines/common/rollout_storage.py b/habitat-baselines/habitat_baselines/common/rollout_storage.py index 8120b56494..f48bd71042 100644 --- a/habitat-baselines/habitat_baselines/common/rollout_storage.py +++ b/habitat-baselines/habitat_baselines/common/rollout_storage.py @@ -17,8 +17,8 @@ build_rnn_build_seq_info, ) from habitat_baselines.utils.common import ( + get_action_space_info, get_num_actions, - is_continuous_action_space, ) @@ -36,18 +36,7 @@ def __init__( num_recurrent_layers=1, is_double_buffered: bool = False, ): - if is_continuous_action_space(action_space): - # Assume ALL actions are NOT discrete - action_shape = ( - get_num_actions( - action_space, - ), - ) - discrete_actions = False - else: - # For discrete pointnav - action_shape = (1,) - discrete_actions = True + discrete_actions, action_shape = get_action_space_info(action_space) self.buffers = TensorDict() self.buffers["observations"] = TensorDict() diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index 6563f3639b..6d774afa9b 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -392,7 +392,7 @@ class HabitatBaselinesConfig(HabitatBaselinesBaseConfig): eval_ckpt_path_dir: str = "data/checkpoints" num_environments: int = 16 num_processes: int = -1 # deprecated - rollout_storage: str = "RolloutStorage" + rollout_storage_name: str = "RolloutStorage" checkpoint_folder: str = "data/checkpoints" num_updates: int = 10000 num_checkpoints: int = 10 diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml index 611a64c2ec..f7f142a8a0 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -16,6 +16,8 @@ defaults: habitat_baselines: verbose: False trainer_name: "ddppo" + updater_name: "HRLPPO" + updater_name: "HRLDDPPO" torch_gpu_id: 0 video_fps: 30 eval_ckpt_path_dir: "" @@ -28,7 +30,7 @@ habitat_baselines: force_torch_single_threaded: True eval_keys_to_include_in_name: ['reward', 'force', 'composite_success'] load_resume_state_config: False - 
rollout_storage: "HrlRolloutStorage" + rollout_storage_name: "HrlRolloutStorage" eval: use_ckpt_config: False diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index c148c62f28..ffc2874240 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -91,9 +91,17 @@ def get_next_skill( raise NotImplementedError() def apply_mask(self, mask: torch.Tensor) -> None: + """ + Called before every step with the mask information at the current step. + """ + pass def get_policy_components(self) -> List[nn.Module]: + """ + Gets the torch modules that are in the HL policy architecture. + """ + return [] def get_termination( diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index b1a8f87cd4..9a1eb5647e 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -29,6 +29,9 @@ def __init__( """ self._config = config self._batch_size = batch_size + self._apply_postcond = self._config.apply_postconds + self._force_end_on_timeout = self._config.force_end_on_timeout + self._max_skill_steps = self._config.max_skill_steps self._cur_skill_step = torch.zeros(self._batch_size) self._should_keep_hold_state = should_keep_hold_state @@ -49,7 +52,7 @@ def __init__( ) else: self._pddl_ac_start = None - if self._config.apply_postconds and self._pddl_ac_start is None: + if self._apply_postconds and self._pddl_ac_start is None: raise ValueError(f"Could not find PDDL action in skill {self}") self._delay_term: List[Optional[bool]] = [ @@ -191,9 +194,9 @@ def should_terminate( device=cur_skill_step.device, dtype=torch.bool, ) - if self._config.max_skill_steps > 0: - over_max_len = cur_skill_step >= self._config.max_skill_steps - if self._config.force_end_on_timeout: + if self._max_skill_steps > 0: + over_max_len = cur_skill_step >= self._max_skill_steps + if self._force_end_on_timeout: bad_terminate = over_max_len else: is_skill_done = is_skill_done | over_max_len @@ -208,7 +211,7 @@ def should_terminate( ) self._delay_term[env_i] = False is_skill_done[i] = True - elif self._config.apply_postconds and is_skill_done[i]: + elif self._apply_postconds and is_skill_done[i]: new_actions[i] = self._apply_postcond( actions, log_info, skill_name[i], env_i, i ) diff --git a/habitat-baselines/habitat_baselines/rl/ppo/policy.py b/habitat-baselines/habitat_baselines/rl/ppo/policy.py index e31120c932..fb83a36478 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/policy.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/policy.py @@ -66,6 +66,8 @@ class PolicyActionData: def write_action(self, write_idx: int, write_action: torch.Tensor) -> None: """ Used to override an action across all environments. + :param write_idx: The index in the action dimension to write the new action. + :param write_action: The action to write at `write_idx`. 
""" self.actions[:, write_idx] = write_action diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index 708e20f802..e169ee7296 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -65,6 +65,7 @@ from habitat_baselines.utils.common import ( batch_obs, generate_video, + get_action_space_info, get_num_actions, inference_mode, is_continuous_action_space, @@ -346,7 +347,7 @@ def _init_train(self, resume_state=None): self._nbuffers = 2 if ppo_cfg.use_double_buffered_sampler else 1 rollouts_cls = baseline_registry.get_storage( - self.config.habitat_baselines.rollout_storage + self.config.habitat_baselines.rollout_storage_name ) self.rollouts = rollouts_cls( ppo_cfg.num_steps, @@ -938,14 +939,9 @@ def _eval_checkpoint( self._init_envs(config, is_eval=True) self._setup_actor_critic_agent(ppo_cfg) - if is_continuous_action_space(self.policy_action_space): - # Assume NONE of the actions are discrete - action_shape = (get_num_actions(self.policy_action_space),) - discrete_actions = False - else: - # For discrete pointnav - action_shape = (1,) - discrete_actions = True + discrete_actions, action_shape = get_action_space_info( + self.policy_action_space + ) if self.agent.actor_critic.should_load_agent_state: self.agent.load_state_dict(ckpt_dict["state_dict"]) diff --git a/habitat-baselines/habitat_baselines/utils/common.py b/habitat-baselines/habitat_baselines/utils/common.py index 76e52423d6..4451ec5ae2 100644 --- a/habitat-baselines/habitat_baselines/utils/common.py +++ b/habitat-baselines/habitat_baselines/utils/common.py @@ -694,6 +694,23 @@ def is_continuous_action_space(action_space) -> bool: ) +def get_action_space_info(ac_space: spaces.Space) -> Tuple[Tuple[int], bool]: + """ + :returns: The shape of the action space and if the action space is discrete. If the action space is discrete, the shape will be `(1,)`. 
+ """ + if is_continuous_action_space(ac_space): + # Assume NONE of the actions are discrete + return ( + get_num_actions( + ac_space, + ), + False, + ) + else: + # For discrete pointnav + return (1,), True + + def get_num_actions(action_space) -> int: num_actions = 0 for v in iterate_action_space_recursively(action_space): From f2c673148e5c2e17655ac8a0b79485078ff8da15 Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 7 Feb 2023 17:53:02 -0800 Subject: [PATCH 42/48] Addressed PR comments --- examples/tutorials/colabs/Habitat2_Quickstart.ipynb | 8 ++++++-- examples/tutorials/colabs/Habitat_Lab.ipynb | 8 ++++++-- .../colabs/Habitat_Lab_TopdownMap_Visualization.ipynb | 4 +++- examples/tutorials/nb_python/Habitat2_Quickstart.py | 1 + examples/tutorials/nb_python/Habitat_Lab.py | 1 + .../nb_python/Habitat_Lab_TopdownMap_Visualization.py | 1 + .../habitat_baselines/common/rollout_storage.py | 5 +---- .../config/rearrange/rl_hierarchical.yaml | 2 +- .../habitat_baselines/rl/hrl/hl/high_level_policy.py | 2 -- habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py | 7 +------ .../habitat_baselines/rl/hrl/skills/oracle_nav.py | 2 +- .../habitat_baselines/rl/hrl/skills/skill.py | 2 +- habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py | 1 - habitat-baselines/habitat_baselines/utils/common.py | 6 ++++-- 14 files changed, 27 insertions(+), 23 deletions(-) diff --git a/examples/tutorials/colabs/Habitat2_Quickstart.ipynb b/examples/tutorials/colabs/Habitat2_Quickstart.ipynb index 3291ab5be8..15bde460af 100644 --- a/examples/tutorials/colabs/Habitat2_Quickstart.ipynb +++ b/examples/tutorials/colabs/Habitat2_Quickstart.ipynb @@ -188,7 +188,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "env = gym.make(\"HabitatRenderPick-v0\")\n", @@ -209,7 +211,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "# Defining New Tasks\n", "\n", diff --git a/examples/tutorials/colabs/Habitat_Lab.ipynb b/examples/tutorials/colabs/Habitat_Lab.ipynb index 8d94345475..ea1fd2c0b4 100644 --- a/examples/tutorials/colabs/Habitat_Lab.ipynb +++ b/examples/tutorials/colabs/Habitat_Lab.ipynb @@ -354,7 +354,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ " action = None\n", @@ -387,7 +389,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "## Create a new Sensor" ] diff --git a/examples/tutorials/colabs/Habitat_Lab_TopdownMap_Visualization.ipynb b/examples/tutorials/colabs/Habitat_Lab_TopdownMap_Visualization.ipynb index ca30a63c24..2978c91f1c 100644 --- a/examples/tutorials/colabs/Habitat_Lab_TopdownMap_Visualization.ipynb +++ b/examples/tutorials/colabs/Habitat_Lab_TopdownMap_Visualization.ipynb @@ -47,7 +47,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "# [setup]\n", diff --git a/examples/tutorials/nb_python/Habitat2_Quickstart.py b/examples/tutorials/nb_python/Habitat2_Quickstart.py index 5eafa8de88..7a45d1c819 100644 --- a/examples/tutorials/nb_python/Habitat2_Quickstart.py +++ b/examples/tutorials/nb_python/Habitat2_Quickstart.py @@ -187,6 +187,7 @@ def insert_render_options(config): # For other examples of task, sensor, and measurement definitions, [see here # for existing 
tasks](https://github.com/facebookresearch/habitat-lab/tree/main/habitat-lab/habitat/tasks/rearrange/sub_tasks). Tasks, sensors, and measurements are connected through a config file that defines the task. + # %% @registry.register_task(name="RearrangeDemoNavPickTask-v0") class NavPickTaskV1(RearrangeTask): diff --git a/examples/tutorials/nb_python/Habitat_Lab.py b/examples/tutorials/nb_python/Habitat_Lab.py index b28c87776e..1df88c4bd1 100644 --- a/examples/tutorials/nb_python/Habitat_Lab.py +++ b/examples/tutorials/nb_python/Habitat_Lab.py @@ -306,6 +306,7 @@ def _check_episode_is_active(self, *args, **kwargs): # %% [markdown] # ## Create a new Sensor + # %% @registry.register_sensor(name="agent_position_sensor") class AgentPositionSensor(habitat.Sensor): diff --git a/examples/tutorials/nb_python/Habitat_Lab_TopdownMap_Visualization.py b/examples/tutorials/nb_python/Habitat_Lab_TopdownMap_Visualization.py index 3a39dc12f5..40ef559a25 100644 --- a/examples/tutorials/nb_python/Habitat_Lab_TopdownMap_Visualization.py +++ b/examples/tutorials/nb_python/Habitat_Lab_TopdownMap_Visualization.py @@ -86,6 +86,7 @@ os.makedirs(output_path) # [/setup] + # %% # [example_1] def example_pointnav_draw_target_birdseye_view(): diff --git a/habitat-baselines/habitat_baselines/common/rollout_storage.py b/habitat-baselines/habitat_baselines/common/rollout_storage.py index f48bd71042..afda49835f 100644 --- a/habitat-baselines/habitat_baselines/common/rollout_storage.py +++ b/habitat-baselines/habitat_baselines/common/rollout_storage.py @@ -16,10 +16,7 @@ build_pack_info_from_dones, build_rnn_build_seq_info, ) -from habitat_baselines.utils.common import ( - get_action_space_info, - get_num_actions, -) +from habitat_baselines.utils.common import get_action_space_info @baseline_registry.register_storage diff --git a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml index f7f142a8a0..f5971142d4 100644 --- a/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml +++ b/habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml @@ -17,7 +17,7 @@ habitat_baselines: verbose: False trainer_name: "ddppo" updater_name: "HRLPPO" - updater_name: "HRLDDPPO" + distrib_updater_name: "HRLDDPPO" torch_gpu_id: 0 video_fps: 30 eval_ckpt_path_dir: "" diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py index ffc2874240..ec12f0ebc1 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hl/high_level_policy.py @@ -95,8 +95,6 @@ def apply_mask(self, mask: torch.Tensor) -> None: Called before every step with the mask information at the current step. """ - pass - def get_policy_components(self) -> List[nn.Module]: """ Gets the torch modules that are in the HL policy architecture. 
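The practical reason for exposing these modules is that an updater can only optimize parameters it can reach; a minimal sketch of chaining such components into an optimizer (the modules and learning rate below are stand-ins, not the actual architecture):

```python
import itertools
import torch
import torch.nn as nn

components = [nn.Linear(8, 4), nn.GRU(4, 4)]  # stand-ins for the HL policy's modules
params = itertools.chain(*(c.parameters() for c in components))
optimizer = torch.optim.Adam(params, lr=2.5e-4)
```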
diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py index 998933c459..4acad90588 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py @@ -28,12 +28,7 @@ def reduce_loss(loss): self._set_grads_to_none() - ( - values, - action_log_probs, - dist_entropy, - _, - ) = self._evaluate_actions( + (values, action_log_probs, dist_entropy, _,) = self._evaluate_actions( batch["observations"], batch["recurrent_hidden_states"], batch["prev_actions"], diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py index fe1364ae51..369bdf4e05 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/oracle_nav.py @@ -121,7 +121,7 @@ def _is_skill_done( for i, batch_i in enumerate(batch_idx): prev_pos = self._prev_pos[batch_i] if prev_pos is not None: - movement = torch.linalg.norm(prev_pos - cur_pos[i]) + movement = (prev_pos - cur_pos[i]).pow(2).sum().sqrt() ret[i] = movement < self._config.stop_thresh self._prev_pos[batch_i] = cur_pos[i] diff --git a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py index 9a1eb5647e..e373f83378 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/skills/skill.py @@ -29,7 +29,7 @@ def __init__( """ self._config = config self._batch_size = batch_size - self._apply_postcond = self._config.apply_postconds + self._apply_postconds = self._config.apply_postconds self._force_end_on_timeout = self._config.force_end_on_timeout self._max_skill_steps = self._config.max_skill_steps diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index e169ee7296..910ef21374 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -66,7 +66,6 @@ batch_obs, generate_video, get_action_space_info, - get_num_actions, inference_mode, is_continuous_action_space, ) diff --git a/habitat-baselines/habitat_baselines/utils/common.py b/habitat-baselines/habitat_baselines/utils/common.py index 4451ec5ae2..2e8570de95 100644 --- a/habitat-baselines/habitat_baselines/utils/common.py +++ b/habitat-baselines/habitat_baselines/utils/common.py @@ -701,8 +701,10 @@ def get_action_space_info(ac_space: spaces.Space) -> Tuple[Tuple[int], bool]: if is_continuous_action_space(ac_space): # Assume NONE of the actions are discrete return ( - get_num_actions( - ac_space, + ( + get_num_actions( + ac_space, + ), ), False, ) From 11e77c326f40232c9ff7ff308e459d2fb85cd92e Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 7 Feb 2023 18:08:43 -0800 Subject: [PATCH 43/48] Fixed config --- .../habitat_baselines/config/default_structured_configs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index 29a6ad3a3f..f90cd5f32c 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -214,6 +214,7 @@ class Eq2CubeConfig(ObsTransformConfig): node=Eq2CubeConfig, ) + 
@attr.s(auto_attribs=True, slots=True) class HrlDefinedSkillConfig(HabitatBaselinesBaseConfig): """ @@ -249,7 +250,7 @@ class HrlDefinedSkillConfig(HabitatBaselinesBaseConfig): @attr.s(auto_attribs=True, slots=True) -class HierarchicalPolicy(HabitatBaselinesBaseConfig): +class HierarchicalPolicyConfig(HabitatBaselinesBaseConfig): high_level_policy: Dict[str, Any] = MISSING defined_skills: Dict[str, HrlDefinedSkillConfig] = dict() use_skills: Dict[str, str] = dict() From 6f5ea767c5bb5bc8412bc850a7a5539d76dba516 Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 7 Feb 2023 18:09:36 -0800 Subject: [PATCH 44/48] Fixed typo --- habitat-baselines/habitat_baselines/common/rollout_storage.py | 2 +- habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/habitat-baselines/habitat_baselines/common/rollout_storage.py b/habitat-baselines/habitat_baselines/common/rollout_storage.py index afda49835f..6fe9a03fc3 100644 --- a/habitat-baselines/habitat_baselines/common/rollout_storage.py +++ b/habitat-baselines/habitat_baselines/common/rollout_storage.py @@ -33,7 +33,7 @@ def __init__( num_recurrent_layers=1, is_double_buffered: bool = False, ): - discrete_actions, action_shape = get_action_space_info(action_space) + action_shape, discrete_actions = get_action_space_info(action_space) self.buffers = TensorDict() self.buffers["observations"] = TensorDict() diff --git a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py index 910ef21374..dcc4289cb6 100644 --- a/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py +++ b/habitat-baselines/habitat_baselines/rl/ppo/ppo_trainer.py @@ -938,7 +938,7 @@ def _eval_checkpoint( self._init_envs(config, is_eval=True) self._setup_actor_critic_agent(ppo_cfg) - discrete_actions, action_shape = get_action_space_info( + action_shape, discrete_actions = get_action_space_info( self.policy_action_space ) From cb6ce62e6c72e58160fe967a5db0b87764d09c3a Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 7 Feb 2023 18:12:01 -0800 Subject: [PATCH 45/48] Fixed another typo --- habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py index 4acad90588..b7bbd38bc2 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_ppo.py @@ -28,7 +28,13 @@ def reduce_loss(loss): self._set_grads_to_none() - (values, action_log_probs, dist_entropy, _,) = self._evaluate_actions( + ( + values, + action_log_probs, + dist_entropy, + _, + _, + ) = self._evaluate_actions( batch["observations"], batch["recurrent_hidden_states"], batch["prev_actions"], From 6d4b968bd17e6a0edfd9b95d2741d6c8bf83a1e4 Mon Sep 17 00:00:00 2001 From: ASzot Date: Tue, 7 Feb 2023 18:13:30 -0800 Subject: [PATCH 46/48] CI --- .../habitat_baselines/config/default_structured_configs.py | 1 - habitat-lab/habitat/config/default_structured_configs.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/habitat-baselines/habitat_baselines/config/default_structured_configs.py b/habitat-baselines/habitat_baselines/config/default_structured_configs.py index f90cd5f32c..6e540e627c 100644 --- a/habitat-baselines/habitat_baselines/config/default_structured_configs.py +++ b/habitat-baselines/habitat_baselines/config/default_structured_configs.py @@ -4,7 +4,6 @@ # 
This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple import attr diff --git a/habitat-lab/habitat/config/default_structured_configs.py b/habitat-lab/habitat/config/default_structured_configs.py index 1a6fc472d5..c0638a849c 100644 --- a/habitat-lab/habitat/config/default_structured_configs.py +++ b/habitat-lab/habitat/config/default_structured_configs.py @@ -200,6 +200,7 @@ class RearrangeStopActionConfig(ActionConfig): class PddlApplyActionConfig(ActionConfig): type: str = "PddlApplyAction" + @attr.s(auto_attribs=True, slots=True) class OracleNavActionConfig(ActionConfig): """ From 9d2c2f5a99a64d72a9a78a2a0f2157bde0e37cfb Mon Sep 17 00:00:00 2001 From: ASzot Date: Wed, 8 Feb 2023 22:32:15 -0800 Subject: [PATCH 47/48] Updated to work with older pytorch version --- .../habitat_baselines/rl/hrl/hrl_rollout_storage.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py index 26dbbe9afc..9acf0854ea 100644 --- a/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py +++ b/habitat-baselines/habitat_baselines/rl/hrl/hrl_rollout_storage.py @@ -95,9 +95,7 @@ def insert( env_idxs = torch.arange(self._num_envs) if rewards is not None: # Accumulate rewards between updates. - reward_write_idxs = torch.maximum( - self._cur_step_idxs - 1, torch.zeros_like(self._cur_step_idxs) - ) + reward_write_idxs = torch.clamp(self._cur_step_idxs - 1, min=0) self.buffers["rewards"][reward_write_idxs, env_idxs] += rewards if len(next_step) > 0: From 48142fb9c152392e8cfe388a60e2d14b869e0fbd Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Thu, 9 Feb 2023 13:18:06 -0800 Subject: [PATCH 48/48] renaming --exp-config to --config-name again --- habitat-baselines/habitat_baselines/README.md | 7 +++---- scripts/generate_profile_shell_scripts.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/habitat-baselines/habitat_baselines/README.md b/habitat-baselines/habitat_baselines/README.md index 8451a4b4a5..8c04a8ef2a 100644 --- a/habitat-baselines/habitat_baselines/README.md +++ b/habitat-baselines/habitat_baselines/README.md @@ -61,15 +61,14 @@ To train a high-level policy, while using pre-learned low-level skills (SRL base ```bash python -u habitat-baselines/habitat_baselines/run.py \ - --exp-config habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml \ - --run-type train + --config-name=rearrange/rl_hierarchical.yaml ``` To run a rearrangement episode with oracle low-level skills and a fixed task planner, run: ```bash python -u habitat-baselines/habitat_baselines/run.py \ - --exp-config habitat-baselines/habitat_baselines/config/rearrange/rl_hierarchical.yaml \ - --run-type eval \ + --config-name=rearrange/rl_hierarchical.yaml \ + habitat_baselines.evaluate=True \ habitat_baselines/rl/policy=hl_fixed \ habitat_baselines/rl/policy/hierarchical_policy/defined_skills=oracle_skills ``` diff --git a/scripts/generate_profile_shell_scripts.py b/scripts/generate_profile_shell_scripts.py index 030bcf29ec..b61728e29a 100644 --- a/scripts/generate_profile_shell_scripts.py +++ b/scripts/generate_profile_shell_scripts.py @@ -22,7 +22,7 @@ if __name__ == "__main__": # The Habitat-lab program to be profiled (the command you usually use to # invoke it). 
- program_str = "python -u -m habitat_baselines.run --exp-config habitat-baselines/habitat_baselines/config/pointnav/ddppo_pointnav.yaml --run-type train" + program_str = "python -u -m habitat_baselines.run --config-name=pointnav/ddppo_pointnav.yaml" # Path to Nsight Systems nsys command-line tool. This hard-coded path is # for the FAIR cluster.
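
For context on the `get_action_space_info` changes in patches 42 and 44 above (the tuple-wrapped return value in `habitat_baselines/utils/common.py` and the corrected unpacking order at the call sites in `rollout_storage.py` and `ppo_trainer.py`), the following is a minimal illustrative sketch of the contract those call sites now rely on: the helper returns `(action_shape, is_discrete)`, in that order, with the shape itself wrapped in a tuple. This is a simplified stand-in using plain `gym` spaces, not the library implementation; the name `get_action_space_info_sketch` and the Box/Discrete-only handling are assumptions made here for illustration.

```python
# Hedged sketch only -- not the habitat_baselines source. It mirrors the return
# contract the patched call sites expect: (action_shape, is_discrete).
from typing import Tuple

import numpy as np
from gym import spaces


def get_action_space_info_sketch(ac_space: spaces.Space) -> Tuple[Tuple[int], bool]:
    """Return (action_shape, is_discrete) for a simple Box or Discrete space."""
    if isinstance(ac_space, spaces.Box):
        # Continuous case: the shape is the flattened number of action dims,
        # wrapped in a tuple, and the discrete flag is False.
        return (int(np.prod(ac_space.shape)),), False
    # Discrete case: a single integer action index is stored per step.
    return (1,), True


if __name__ == "__main__":
    # Unpacking order matches the fixed call sites: shape first, flag second.
    action_shape, discrete_actions = get_action_space_info_sketch(
        spaces.Box(low=-1.0, high=1.0, shape=(7,))
    )
    assert action_shape == (7,) and discrete_actions is False
```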