Add support for memoroids (linear recurrent models) #91

Status: Open. Wants to merge 41 commits into base: main.

Commits (41)
7fe570c
Convert ffm memoroid to flax, it seems to work
smorad Jun 16, 2024
229e7f1
temporary work
EdanToledo Jun 21, 2024
c2f3e9a
temporary work - hacky solution to get running
EdanToledo Jun 21, 2024
a8b9621
Merge branch 'main' into pr/smorad/91
EdanToledo Jun 22, 2024
5344323
chore: cleanup and edit stacked_rnn arch
EdanToledo Jun 22, 2024
73b5a83
chore: move around files
EdanToledo Jun 22, 2024
77e4a21
large ffm refactor, runs but possibly introduced bugs
smorad Jun 22, 2024
46f2d26
comments, further cleanup, and only return final state
smorad Jun 22, 2024
369b1cf
factor out recurrent associative scan
smorad Jun 22, 2024
72e4b7a
Add simplified FFM and stacked simplified FFM
smorad Jun 22, 2024
2faac71
chore: separate files - add baseclass etc
EdanToledo Jun 22, 2024
dc25d6c
chore: edit comments
EdanToledo Jun 22, 2024
da03f00
modify ffm edan to work for batched/vmapped memoroids
smorad Jun 22, 2024
9eacd4f
feat: make edits to ffm to make batch dim after sequence dim and add …
EdanToledo Jun 23, 2024
a2e0c75
Merge branch 'main' into pr/smorad/91
EdanToledo Jun 23, 2024
15473a4
chore: change scanned memorid to expect non sequence dimension carry …
EdanToledo Jun 24, 2024
74e35b3
feat: add explicit batch dimension and network config - rec_ppo now w…
EdanToledo Jun 24, 2024
2446b9c
chore: remove reliance on start variable being inside recurrent state
EdanToledo Jun 24, 2024
aa76325
fix reset zero error
smorad Jun 24, 2024
c5816de
better tests
smorad Jun 24, 2024
8fc2640
feat: add more popjym configs
EdanToledo Jun 24, 2024
ca03d7f
Merge branch 'memoroid' of https://github.com/smorad/stoix into pr/sm…
EdanToledo Jun 24, 2024
d8c845b
fix dummy state
smorad Jun 25, 2024
7feaa3f
better reset tests
smorad Jun 25, 2024
564dc81
add simple training test
smorad Jun 25, 2024
4752cb8
add simple training test
smorad Jun 25, 2024
78e5fad
add simple training test
smorad Jun 25, 2024
0b210ce
oops wrong y, pls pull
smorad Jun 25, 2024
acf9a29
feat: add demos
EdanToledo Jun 25, 2024
6d93baa
chore: edit working demo
EdanToledo Jun 25, 2024
f79febc
fix: add required popjym wrapper
EdanToledo Jun 26, 2024
e90e270
chore: move all current work into memoroids file to be contained
EdanToledo Jun 30, 2024
5f41a03
small hparam tweaks for memoroid
smorad Jul 1, 2024
9916c79
chore: reorganize code and add start to lru
EdanToledo Jul 3, 2024
b21d5e2
chore: slight config change
EdanToledo Jul 3, 2024
0cfb832
chore: more editing and add s5
EdanToledo Jul 4, 2024
232b7ca
chore: config and network edits
EdanToledo Jul 4, 2024
6dd59c5
feat: add stacked model
EdanToledo Jul 7, 2024
ab3de3f
chore: refactor slightly
EdanToledo Jul 7, 2024
e278bb5
chore: clean up code
EdanToledo Jul 7, 2024
26d1fc4
chore: change configs
EdanToledo Jul 7, 2024
4 changes: 2 additions & 2 deletions stoix/configs/arch/anakin.yaml
@@ -9,10 +9,10 @@ total_timesteps: 1e7 # Set the total environment steps.
 num_updates: ~ # Number of updates

 # --- Evaluation ---
-evaluation_greedy: False # Evaluate the policy greedily. If True the policy will select
+evaluation_greedy: True # Evaluate the policy greedily. If True the policy will select
 # an action which corresponds to the greatest logit. If false, the policy will sample
 # from the logits.
 num_eval_episodes: 128 # Number of episodes to evaluate per evaluation.
-num_evaluation: 50 # Number of evenly spaced evaluations to perform during training.
+num_evaluation: 19 # Number of evenly spaced evaluations to perform during training.
 absolute_metric: True # Whether the absolute metric should be computed. For more details
 # on the absolute metric please see: https://arxiv.org/abs/2209.10485
4 changes: 2 additions & 2 deletions stoix/configs/default_rec_ppo.yaml
@@ -2,6 +2,6 @@ defaults:
   - logger: base_logger
   - arch: anakin
   - system: rec_ppo
-  - network: rnn
-  - env: gymnax/cartpole
+  - network: memoroid
+  - env: popjym/repeat_first_easy
   - _self_
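The defaults list above is standard Hydra composition, so the previous behaviour stays one override away. A minimal sketch using Hydra's compose API; the config_path is an assumption based on this repo's layout:

    from hydra import compose, initialize

    with initialize(version_base=None, config_path="stoix/configs"):
        cfg = compose(
            config_name="default_rec_ppo",
            overrides=["network=rnn", "env=gymnax/cartpole"],  # restore the old defaults
        )
    print(cfg.network.rnn_layer._target_)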
12 changes: 12 additions & 0 deletions stoix/configs/env/popjym/auto_encode_easy.yaml
@@ -0,0 +1,12 @@
# ---Environment Configs---
env_name: popjym

scenario:
  name: AutoencodeEasy
  task_name: auto_encode_easy

kwargs: {}

# Defines the metric that will be used to evaluate the performance of the agent.
# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
eval_metric: episode_return
12 changes: 12 additions & 0 deletions stoix/configs/env/popjym/auto_encode_medium.yaml
@@ -0,0 +1,12 @@
# ---Environment Configs---
env_name: popjym

scenario:
  name: AutoencodeMedium
  task_name: auto_encode_medium

kwargs: {}

# Defines the metric that will be used to evaluate the performance of the agent.
# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
eval_metric: episode_return
12 changes: 12 additions & 0 deletions stoix/configs/env/popjym/count_recall_easy.yaml
@@ -0,0 +1,12 @@
# ---Environment Configs---
env_name: popjym

scenario:
  name: CountRecallEasy
  task_name: count_recall_easy

kwargs: {}

# Defines the metric that will be used to evaluate the performance of the agent.
# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
eval_metric: episode_return
12 changes: 12 additions & 0 deletions stoix/configs/env/popjym/count_recall_medium.yaml
@@ -0,0 +1,12 @@
# ---Environment Configs---
env_name: popjym

scenario:
  name: CountRecallMedium
  task_name: count_recall_medium

kwargs: {}

# Defines the metric that will be used to evaluate the performance of the agent.
# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
eval_metric: episode_return
12 changes: 12 additions & 0 deletions stoix/configs/env/popjym/repeat_first_easy.yaml
@@ -0,0 +1,12 @@
# ---Environment Configs---
env_name: popjym

scenario:
  name: RepeatFirstEasy
  task_name: repeat_first_easy

kwargs: {}

# Defines the metric that will be used to evaluate the performance of the agent.
# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
eval_metric: episode_return
12 changes: 12 additions & 0 deletions stoix/configs/env/popjym/repeat_first_hard.yaml
@@ -0,0 +1,12 @@
# ---Environment Configs---
env_name: popjym

scenario:
  name: RepeatFirstHard
  task_name: repeat_first_hard

kwargs: {}

# Defines the metric that will be used to evaluate the performance of the agent.
# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
eval_metric: episode_return
12 changes: 12 additions & 0 deletions stoix/configs/env/popjym/repeat_first_medium.yaml
@@ -0,0 +1,12 @@
# ---Environment Configs---
env_name: popjym

scenario:
  name: RepeatFirstMedium
  task_name: repeat_first_medium

kwargs: {}

# Defines the metric that will be used to evaluate the performance of the agent.
# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
eval_metric: episode_return
4 changes: 4 additions & 0 deletions stoix/configs/env/popjym/stateless_cartpole_easy.yaml
@@ -6,3 +6,7 @@ scenario:
   task_name: stateless_cartpole_easy

 kwargs: {}
+
+# Defines the metric that will be used to evaluate the performance of the agent.
+# This metric is returned at the end of an experiment and can be used for hyperparameter tuning.
+eval_metric: episode_return
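All eight popjym scenario files above follow the same schema (env_name, scenario.name, scenario.task_name, kwargs, eval_metric). A quick way to sanity-check one of them, using OmegaConf (already a Hydra dependency); the path assumes the repo root as the working directory:

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("stoix/configs/env/popjym/repeat_first_easy.yaml")
    assert cfg.env_name == "popjym"
    assert cfg.scenario.name == "RepeatFirstEasy"
    print(cfg.eval_metric)  # episode_return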
4 changes: 2 additions & 2 deletions stoix/configs/logger/base_logger.yaml
@@ -4,12 +4,12 @@ base_exp_path: results # Base path for logging.
 use_console: True # Whether to log to stdout.
 use_tb: False # Whether to use tensorboard logging.
 use_json: False # Whether to log marl-eval style to json files.
-use_neptune: False # Whether to log to neptune.ai.
+use_neptune: True # Whether to log to neptune.ai.
 use_wandb: False # Whether to log to wandb.ai.

 # --- Other logger kwargs ---
 kwargs:
-  project: ~ # Project name in neptune.ai or wandb.ai.
+  project: e.toledo/Stoix # Project name in neptune.ai or wandb.ai.
   tags: [stoix] # Tags to add to the experiment.
   detailed_logging: False # having mean/std/min/max can clutter neptune/wandb so we make it optional
   json_path: ~ # If set, json files will be logged to a set path so that multiple experiments can
3 changes: 1 addition & 2 deletions stoix/configs/network/muzero.yaml
@@ -30,8 +30,7 @@ wm_network:
   activation: silu

   # This can be seen as the dynamics network.
-  rnn_size: 256
-  num_stacked_rnn_layers: 2
+  rnn_sizes: [256, 256]
   rnn_cell_type: "gru"
   recurrent_activation: "sigmoid"
28 changes: 14 additions & 14 deletions stoix/configs/network/rnn.yaml
@@ -3,35 +3,35 @@
 actor_network:
   pre_torso:
     _target_: stoix.networks.torso.MLPTorso
-    layer_sizes: [128]
-    use_layer_norm: False
-    activation: silu
+    layer_sizes: [256]
+    use_layer_norm: True
+    activation: leaky_relu
   rnn_layer:
-    _target_: stoix.networks.base.ScannedRNN
+    _target_: stoix.networks.recurrent.ScannedRNN
     cell_type: gru
-    hidden_state_dim: 128
+    hidden_state_dim: 256
   post_torso:
     _target_: stoix.networks.torso.MLPTorso
-    layer_sizes: [128]
+    layer_sizes: [256]
     use_layer_norm: False
-    activation: silu
+    activation: leaky_relu
   action_head:
     _target_: stoix.networks.heads.CategoricalHead

 critic_network:
   pre_torso:
     _target_: stoix.networks.torso.MLPTorso
-    layer_sizes: [128]
-    use_layer_norm: False
-    activation: silu
+    layer_sizes: [256]
+    use_layer_norm: True
+    activation: leaky_relu
   rnn_layer:
-    _target_: stoix.networks.base.ScannedRNN
+    _target_: stoix.networks.recurrent.ScannedRNN
     cell_type: gru
-    hidden_state_dim: 128
+    hidden_state_dim: 256
   post_torso:
     _target_: stoix.networks.torso.MLPTorso
-    layer_sizes: [128]
+    layer_sizes: [256]
     use_layer_norm: False
-    activation: silu
+    activation: leaky_relu
   critic_head:
     _target_: stoix.networks.heads.ScalarCriticHead
41 changes: 41 additions & 0 deletions stoix/configs/network/stacked_lrm.yaml
@@ -0,0 +1,41 @@
# ---Recurrent Structure Networks for PPO ---

actor_network:
  pre_torso:
    _target_: stoix.networks.torso.MLPTorso
    layer_sizes: [256]
    use_layer_norm: False
    activation: leaky_relu
  rnn_layer:
    _target_: stoix.networks.lrm.shared.StackedLRM
    num_cells: 2
    lrm_cell_type: lru
    cell_kwargs:
      hidden_state_dim: 256
  post_torso:
    _target_: stoix.networks.torso.MLPTorso
    layer_sizes: [256]
    use_layer_norm: False
    activation: leaky_relu
  action_head:
    _target_: stoix.networks.heads.CategoricalHead

critic_network:
  pre_torso:
    _target_: stoix.networks.torso.MLPTorso
    layer_sizes: [256]
    use_layer_norm: False
    activation: leaky_relu
  rnn_layer:
    _target_: stoix.networks.lrm.shared.StackedLRM
    num_cells: 2
    lrm_cell_type: lru
    cell_kwargs:
      hidden_state_dim: 256
  post_torso:
    _target_: stoix.networks.torso.MLPTorso
    layer_sizes: [256]
    use_layer_norm: False
    activation: leaky_relu
  critic_head:
    _target_: stoix.networks.heads.ScalarCriticHead
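Blocks like rnn_layer above are materialised by Hydra via their _target_ key. A minimal sketch; hydra.utils.instantiate is standard Hydra API, but StackedLRM's constructor signature is inferred from the config keys and not verified:

    import hydra
    from omegaconf import OmegaConf

    rnn_cfg = OmegaConf.create(
        {
            "_target_": "stoix.networks.lrm.shared.StackedLRM",
            "num_cells": 2,
            "lrm_cell_type": "lru",
            "cell_kwargs": {"hidden_state_dim": 256},
        }
    )
    rnn_layer = hydra.utils.instantiate(rnn_cfg)  # -> StackedLRM(num_cells=2, ...)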
12 changes: 6 additions & 6 deletions stoix/configs/system/rec_ppo.yaml
@@ -3,18 +3,18 @@
 system_name: rec_ppo # Name of the system.

 # --- RL hyperparameters ---
-actor_lr: 2.5e-4 # Learning rate for actor network
-critic_lr: 2.5e-4 # Learning rate for critic network
-rollout_length: 128 # Number of environment steps per vectorised environment.
-epochs: 4 # Number of ppo epochs per training data batch.
-num_minibatches: 2 # Number of minibatches per ppo epoch.
+actor_lr: 3e-5 # Learning rate for actor network
+critic_lr: 3e-5 # Learning rate for critic network
+rollout_length: 64 # Number of environment steps per vectorised environment.
+epochs: 10 # Number of ppo epochs per training data batch.
+num_minibatches: 64 # Number of minibatches per ppo epoch.
 gamma: 0.99 # Discounting factor.
 gae_lambda: 0.95 # Lambda value for GAE computation.
 clip_eps: 0.2 # Clipping value for PPO updates and value function.
 ent_coef: 0.01 # Entropy regularisation term for loss function.
 vf_coef: 0.5 # Critic weight in
 max_grad_norm: 0.5 # Maximum norm of the gradients for a weight update.
-decay_learning_rates: False # Whether learning rates should be linearly decayed during training.
+decay_learning_rates: True # Whether learning rates should be linearly decayed during training.
 standardize_advantages: True # Whether to standardize the advantages.

 # --- Recurrent hyperparameters ---
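Turning on decay_learning_rates pairs naturally with the lower 3e-5 learning rate. Below is a sketch of the linear decay this flag typically enables, written with optax; the exact wiring inside Stoix is not shown in this diff, and num_updates is a stand-in value:

    import optax

    num_updates = 1_000  # assumption: derived from total_timesteps elsewhere
    lr_schedule = optax.linear_schedule(
        init_value=3e-5, end_value=0.0, transition_steps=num_updates
    )
    actor_optim = optax.chain(
        optax.clip_by_global_norm(0.5),         # max_grad_norm from this config
        optax.adam(learning_rate=lr_schedule),  # schedule in place of a fixed lr
    )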
56 changes: 4 additions & 52 deletions stoix/networks/base.py
@@ -1,16 +1,12 @@
-import functools
 from typing import Sequence, Tuple, Union

 import chex
 import distrax
 import jax
 import jax.numpy as jnp
-import numpy as np
 from flax import linen as nn

 from stoix.base_types import Observation, RNNObservation
 from stoix.networks.inputs import ObservationInput
-from stoix.networks.utils import parse_rnn_cell


 class FeedForwardActor(nn.Module):
@@ -83,51 +79,12 @@ def __call__(
         return concatenated


-class ScannedRNN(nn.Module):
-    hidden_state_dim: int
-    cell_type: str
-
-    @functools.partial(
-        nn.scan,
-        variable_broadcast="params",
-        in_axes=0,
-        out_axes=0,
-        split_rngs={"params": False},
-    )
-    @nn.compact
-    def __call__(self, rnn_state: chex.Array, x: chex.Array) -> Tuple[chex.Array, chex.Array]:
-        """Applies the module."""
-        ins, resets = x
-        hidden_state_reset_fn = lambda reset_state, current_state: jnp.where(
-            resets[:, np.newaxis],
-            reset_state,
-            current_state,
-        )
-        rnn_state = jax.tree_util.tree_map(
-            hidden_state_reset_fn,
-            self.initialize_carry(ins.shape[0]),
-            rnn_state,
-        )
-        new_rnn_state, y = parse_rnn_cell(self.cell_type)(features=self.hidden_state_dim)(
-            rnn_state, ins
-        )
-        return new_rnn_state, y
-
-    @nn.nowrap
-    def initialize_carry(self, batch_size: int) -> chex.Array:
-        """Initializes the carry state."""
-        # Use a dummy key since the default state init fn is just zeros.
-        cell = parse_rnn_cell(self.cell_type)(features=self.hidden_state_dim)
-        return cell.initialize_carry(jax.random.PRNGKey(0), (batch_size, self.hidden_state_dim))
-
-
 class RecurrentActor(nn.Module):
     """Recurrent Actor Network."""

     action_head: nn.Module
     post_torso: nn.Module
-    hidden_state_dim: int
-    cell_type: str
+    rnn: nn.Module
     pre_torso: nn.Module
     input_layer: nn.Module = ObservationInput()

@@ -143,9 +100,7 @@ def __call__(
         observation = self.input_layer(observation)
         policy_embedding = self.pre_torso(observation)
         policy_rnn_input = (policy_embedding, done)
-        policy_hidden_state, policy_embedding = ScannedRNN(self.hidden_state_dim, self.cell_type)(
-            policy_hidden_state, policy_rnn_input
-        )
+        policy_hidden_state, policy_embedding = self.rnn(policy_hidden_state, policy_rnn_input)
         actor_logits = self.post_torso(policy_embedding)
         pi = self.action_head(actor_logits)

@@ -157,8 +112,7 @@ class RecurrentCritic(nn.Module):

     critic_head: nn.Module
     post_torso: nn.Module
-    hidden_state_dim: int
-    cell_type: str
+    rnn: nn.Module
     pre_torso: nn.Module
     input_layer: nn.Module = ObservationInput()

@@ -175,9 +129,7 @@

         critic_embedding = self.pre_torso(observation)
         critic_rnn_input = (critic_embedding, done)
-        critic_hidden_state, critic_embedding = ScannedRNN(self.hidden_state_dim, self.cell_type)(
-            critic_hidden_state, critic_rnn_input
-        )
+        critic_hidden_state, critic_embedding = self.rnn(critic_hidden_state, critic_rnn_input)
         critic_output = self.post_torso(critic_embedding)
         critic_output = self.critic_head(critic_output)
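The net effect of this refactor: RecurrentActor and RecurrentCritic no longer hard-code ScannedRNN. Any module with a (carry, (inputs, done)) -> (carry, outputs) call convention can be injected, which is what lets the memoroid/LRM cells slot in. A hedged construction sketch; how CategoricalHead receives action_dim is an assumption:

    import hydra
    from omegaconf import OmegaConf
    from stoix.networks.base import RecurrentActor

    cfg = OmegaConf.load("stoix/configs/network/rnn.yaml").actor_network
    actor = RecurrentActor(
        pre_torso=hydra.utils.instantiate(cfg.pre_torso),
        rnn=hydra.utils.instantiate(cfg.rnn_layer),  # injected recurrent core
        post_torso=hydra.utils.instantiate(cfg.post_torso),
        action_head=hydra.utils.instantiate(cfg.action_head, action_dim=4),
    )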
12 changes: 5 additions & 7 deletions stoix/networks/layers.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Sequence, Tuple

 import chex
 import jax
@@ -24,19 +24,17 @@ class StackedRNN(nn.Module):
         activation_fn (str): The activation function to use in each RNN cell (default is "tanh").
     """

-    rnn_size: int
+    rnn_sizes: Sequence[int]
     rnn_cls: nn.Module
-    num_layers: int
     activation_fn: str = "sigmoid"

     def setup(self) -> None:
         """Set up the RNN cells for the stacked RNN."""
         self.cells = [
-            self.rnn_cls(
-                features=self.rnn_size, activation_fn=parse_activation_fn(self.activation_fn)
-            )
-            for _ in range(self.num_layers)
+            self.rnn_cls(features=size, activation_fn=parse_activation_fn(self.activation_fn))
+            for size in self.rnn_sizes
         ]
+        self.num_layers = len(self.cells)

     def __call__(
         self, all_rnn_states: List[chex.ArrayTree], x: chex.Array
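This interface change replaces the single rnn_size + num_layers pair with an explicit rnn_sizes sequence, matching the muzero.yaml edit earlier in this PR. A small usage sketch: flax.linen.GRUCell does accept features and activation_fn as in the list comprehension above, but the zero-carry shapes and the behaviour of the truncated __call__ body are assumptions:

    import jax
    import jax.numpy as jnp
    from flax import linen as nn
    from stoix.networks.layers import StackedRNN

    stack = StackedRNN(rnn_sizes=[256, 128], rnn_cls=nn.GRUCell, activation_fn="tanh")
    states = [jnp.zeros((8, size)) for size in (256, 128)]  # one carry per layer
    x = jnp.zeros((8, 64))
    params = stack.init(jax.random.PRNGKey(0), states, x)
    new_states, y = stack.apply(params, states, x)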