Merge pull request #134 from BDonnot/bd_dev
Adding a few features and fixes
BDonnot authored May 3, 2021
2 parents 2995859 + f755f47 commit f9ed970
Showing 20 changed files with 243 additions and 35 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -305,6 +305,7 @@ test_issue185.py
test_can_make_opponent.py
enigma_nili.py
test_issue196.py
test_increasingreward.py

# profiling files
**.prof
11 changes: 10 additions & 1 deletion CHANGELOG.rst
@@ -24,16 +24,25 @@ Change Log

[1.5.2] - 2021-xx-yy
-----------------------
- [BREAKING]: allow the opponent to choose the duration of its attack. This breaks the previous "Opponent.attack(...)"
signature by adding an object in the return value. All code provided with grid2op is compatible with this
change. (For previously coded opponents, the only thing you have to do to make them compliant with
the new interface is, in the `opponent.attack(...)` function, to return `whatever_you_returned_before, None` instead
of simply `whatever_you_returned_before`; a short adaptation sketch is shown after this changelog excerpt.)
- [FIXED]: `Issue#196 <https://github.com/rte-france/Grid2Op/issues/196>`_ an issue related to the
low / high of the observation if using the gym_compat module. Some more protections
are enforced now.
- [FIXED]: `Issue#196 <https://github.com/rte-france/Grid2Op/issues/196>`_ an issue related to the scaling when negative
numbers are used (in these cases low / max would be mixed up)
- [FIXED]: an issue with the `IncreasingFlatReward` reward type
- [ADDED]: a reward `EpisodeDurationReward` that is always 0 except at the end of an episode, where it returns a float
proportional to the number of steps performed since the beginning of the episode.
- [ADDED]: in the `Observation` the possibility to retrieve the current number of steps
- [IMPROVED]: on Windows at least, grid2op does not work with gym < 0.17.2. Checks are performed to make sure
the installed OpenAI gym package meets this requirement (see issue
`Issue#185 <https://github.com/rte-france/Grid2Op/issues/185>`_ )
- [IMPROVED] the seeding of openAI gym composed action spaces (see issue `https://github.com/openai/gym/issues/2166`):
while waiting for an official fix, grid2op will use the solution proposed there
(https://github.com/openai/gym/issues/2166#issuecomment-803984619)

[1.5.1] - 2021-04-15
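The breaking change listed above only affects the return value of `Opponent.attack(...)`. Below is a minimal, hypothetical adaptation sketch (the class name `MyLegacyOpponent` is illustrative and not part of this commit): returning `None` as the duration keeps the previous behaviour, i.e. grid2op picks the maximum allowed duration.

import grid2op
from grid2op.Opponent import BaseOpponent


class MyLegacyOpponent(BaseOpponent):
    """Hypothetical opponent showing how to adapt to the new interface."""

    def attack(self, observation, agent_action, env_action, budget, previous_fails):
        attack = None  # whatever attack the opponent used to compute before
        # old interface: return attack
        # new interface: also return a duration; None means "maximum allowed"
        return attack, None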
1 change: 1 addition & 0 deletions grid2op/Environment/BaseEnv.py
@@ -1551,6 +1551,7 @@ def step(self, action):
lines_attacked, subs_attacked = None, None
conv_ = None
init_line_status = copy.deepcopy(self.backend.get_line_status())

beg_step = time.time()
try:
beg_ = time.time()
16 changes: 9 additions & 7 deletions grid2op/Environment/Environment.py
@@ -244,13 +244,6 @@ def _init_backend(self,
actionClass=CompleteAction,
legal_action=self._game_rules.legal_action)

self._helper_observation_class = ObservationSpace.init_grid(gridobj=bk_type)
self._observation_space = self._helper_observation_class(gridobj=bk_type,
observationClass=observationClass,
actionClass=actionClass,
rewardClass=rewardClass,
env=self)

# handles input data
if not isinstance(chronics_handler, ChronicsHandler):
raise Grid2OpException(
@@ -263,6 +256,15 @@
names_chronics_to_backend=names_chronics_to_backend)
self.names_chronics_to_backend = names_chronics_to_backend

# this needs to be done after the chronics handler: rewards might need information
# about the chronics to work properly.
self._helper_observation_class = ObservationSpace.init_grid(gridobj=bk_type)
self._observation_space = self._helper_observation_class(gridobj=bk_type,
observationClass=observationClass,
actionClass=actionClass,
rewardClass=rewardClass,
env=self)

# test to make sure the backend is consistent with the chronics generator
self.chronics_handler.check_validity(self.backend)
self.delta_time_seconds = dt_float(self.chronics_handler.time_interval.seconds)
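The comment added above explains the reordering: the observation space builds the reward, and some rewards query the chronics handler when they are initialized. As a hedged illustration (mirroring `EpisodeDurationReward.initialize` introduced later in this commit; the class name `ChronicsAwareReward` is hypothetical), such a reward can only work once `self.chronics_handler` has been set up:

import numpy as np
from grid2op.Reward.BaseReward import BaseReward


class ChronicsAwareReward(BaseReward):
    """Hypothetical reward that needs the chronics handler at initialization."""

    def initialize(self, env):
        # env.chronics_handler must already exist here, hence the reordering above
        if env.chronics_handler.max_timestep() > 0:
            self.total_time_steps = env.chronics_handler.max_timestep()
        else:
            self.total_time_steps = np.inf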
6 changes: 6 additions & 0 deletions grid2op/Observation/BaseObservation.py
@@ -188,6 +188,9 @@ class BaseObservation(GridObjects):
curtailment_limit: :class:`numpy.ndarray`, dtype:float
Limit (in ratio of gen_pmax) imposed on each renewable generator.
current_step: ``int``
Current number of steps performed up until this observation (NB this is not given in the observation if
it is transformed into a vector)
"""

_attr_eq = ["line_status",
@@ -316,6 +319,9 @@ def __init__(self,
self.gen_theta = np.empty(shape=self.n_gen, dtype=dt_float)
self.storage_theta = np.empty(shape=self.n_storage, dtype=dt_float)

# counter
self.current_step = 0

def state_of(self,
_sentinel=None,
load_id=None,
3 changes: 3 additions & 0 deletions grid2op/Observation/CompleteObservation.py
@@ -130,6 +130,9 @@ def update(self, env, with_forecast=True):
self._reset_matrices()
self.reset()

# counter
self.current_step = env.nb_time_step

# extract the time stamps
self.year = dt_int(env.time_stamp.year)
self.month = dt_int(env.time_stamp.month)
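A short usage sketch for the new `current_step` attribute (the environment name is taken from the docstrings of this commit and used as an example only); it simply mirrors `env.nb_time_step` at the time the observation was built:

import grid2op

env = grid2op.make("rte_case14_realistic")
obs = env.reset()
print(obs.current_step)   # number of steps performed so far in the episode

obs, reward, done, info = env.step(env.action_space())
print(obs.current_step)   # increased by one after the step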
9 changes: 9 additions & 0 deletions grid2op/Observation/_ObsEnv.py
@@ -123,6 +123,9 @@ def __init__(self,
self._sum_curtailment_mw_init = 0.
self._sum_curtailment_mw_prev_init = 0.

# step count
self._nb_time_step_init = 0

def _init_myclass(self):
"""this class has already all the powergrid information: it is initialized in the obs space !"""
pass
@@ -363,6 +366,9 @@ def _reset_to_orig_state(self):
self._sum_curtailment_mw = self._sum_curtailment_mw_init
self._sum_curtailment_mw_prev = self._sum_curtailment_mw_prev_init

# current step
self.nb_time_step = self._nb_time_step_init

def simulate(self, action):
"""
INTERNAL
@@ -490,5 +496,8 @@ def update_grid(self, env):
# time delta
self.delta_time_seconds = env.delta_time_seconds

# current time
self._nb_time_step_init = env.nb_time_step

def get_current_line_status(self):
return self._line_status == 1
6 changes: 5 additions & 1 deletion grid2op/Opponent/BaseOpponent.py
@@ -68,10 +68,14 @@ def attack(self, observation, agent_action, env_action, budget, previous_fails):
-------
attack: :class:`grid2op.Action.Action`
The attack performed by the opponent. In this case, a do nothing, all the time.
duration: ``int``
The duration of the attack
"""
# TODO maybe have a class "GymOpponent" where the observation would include the budget and all other
# TODO information, and forward something to the "act" method.
return None
return None, None

def tell_attack_continues(self, observation, agent_action, env_action, budget):
"""
40 changes: 31 additions & 9 deletions grid2op/Opponent/OpponentSpace.py
@@ -5,13 +5,20 @@
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of Grid2Op, Grid2Op a testbed platform to model sequential decision making in power systems.

import numpy as np

from grid2op.Exceptions import OpponentError


class OpponentSpace(object):
"""
Is similar to the action space, but for the opponent.
This class is used to express some "constraints" on the opponent attack. The opponent is free to attack whatever
it wants, for how long it wants and when it wants. This class ensures that the opponent does not break any
rules.
Attributes
----------
action_space: :class:`grid2op.Action.ActionSpace`
@@ -32,7 +39,12 @@ class OpponentSpace(object):
budget_per_timestep: ``float``
The increase of the opponent budget per time step (if any)
"""
def __init__(self, compute_budget, init_budget, opponent, attack_duration, attack_cooldown,
def __init__(self,
compute_budget,
init_budget,
opponent,
attack_duration, # maximum duration of an attack
attack_cooldown, # minimum duration between two consecutive attacks
budget_per_timestep=0., action_space=None):
if action_space is not None:
if not isinstance(action_space, compute_budget.action_space):
@@ -48,7 +60,7 @@ def __init__(self, compute_budget, init_budget, opponent, attack_duration, attac
self._do_nothing = self.action_space()
self.previous_fails = False
self.budget_per_timestep = budget_per_timestep
self.attack_duration = attack_duration
self.attack_max_duration = attack_duration
self.attack_cooldown = attack_cooldown
self.current_attack_duration = 0
self.current_attack_cooldown = attack_cooldown
@@ -150,32 +162,42 @@ def attack(self, observation, agent_action, env_action):
self.current_attack_cooldown = max(0, self.current_attack_cooldown - 1)
attack_called = False

# If currently attacking
if self.current_attack_duration > 0:
# previous attack is not over
attack = self.last_attack

# If the opponent has already attacked today
elif self.current_attack_cooldown > self.attack_cooldown:
# minimum time between two consecutive attack not met
attack = None

# If the opponent can attack
else:
self.previous_fails = False
attack = self.opponent.attack(observation, agent_action, env_action, self.budget,
self.previous_fails)
attack_called = True
attack, duration = self.opponent.attack(observation, agent_action, env_action, self.budget,
self.previous_fails)
if duration is None:
if np.isfinite(self.attack_max_duration):
duration = self.attack_max_duration
else:
duration = 1

if duration > self.attack_max_duration:
# duration chosen by the opponent would exceed the maximum duration allowed
attack = None

# If the cost is too high
final_budget = self.budget # TODO add the: + self.budget_per_timestep * (self.attack_duration - 1)
# i did not do it in case an attack is ok at the beginning, ok at the end, but at some point in the attack
# process it is not (but i'm not sure this can happen, and don't have time to think about it right now)
if self.attack_duration * self.compute_budget(attack) > final_budget:
if duration * self.compute_budget(attack) > final_budget:
attack = None
self.previous_fails = True

# If we can afford the attack
elif attack is not None:
# even if it's "do nothing", it's sill an attack. To bad if the opponent chose to do nothing.
self.current_attack_duration = self.attack_duration
# even if it's "do nothing", it's still an attack. Too bad if the opponent chose to do nothing.
self.current_attack_duration = duration
self.current_attack_cooldown += self.attack_cooldown

if not attack_called:
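To summarize the duration handling added to `OpponentSpace.attack`, here is a hedged, standalone sketch (the helper name `_resolve_duration` is hypothetical): a duration of `None` falls back to the maximum allowed duration, or to a single step when that maximum is unbounded; an attack whose duration exceeds the maximum is discarded, and the budget test then uses the chosen duration.

import numpy as np


def _resolve_duration(duration, attack_max_duration):
    """Hypothetical helper mirroring the fallback rule of OpponentSpace.attack."""
    if duration is None:
        if np.isfinite(attack_max_duration):
            return attack_max_duration
        return 1  # unbounded maximum: default to a one-step attack
    return duration


# the attack is then rejected if duration > attack_max_duration, and the
# budget check becomes: duration * compute_budget(attack) <= available budget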
9 changes: 6 additions & 3 deletions grid2op/Opponent/RandomLineOpponent.py
@@ -108,20 +108,23 @@ def attack(self, observation, agent_action, env_action,
-------
attack: :class:`grid2op.Action.Action`
The attack performed by the opponent. In this case, a do nothing, all the time.
duration: ``int``
The duration of the attack (if ``None`` then the attack will be made for the longest allowed time)
"""
# TODO maybe have a class "GymOpponent" where the observation would include the budget and all other
# TODO information, and forward something to the "act" method.

if observation is None: # during creation of the environment
return None # i choose not to attack in this case
return None, 0 # i choose not to attack in this case

# Status of attackable lines
status = observation.line_status[self._lines_ids]

# If all attackable lines are disconnected
if np.all(~status):
return None # i choose not to attack in this case
return None, 0 # i choose not to attack in this case

# Pick a line among the connected lines
attack = self.space_prng.choice(self._attacks[status])
return attack
return attack, None
12 changes: 8 additions & 4 deletions grid2op/Opponent/WeightedRandomOpponent.py
@@ -135,13 +135,16 @@ def attack(self, observation, agent_action, env_action,
-------
attack: :class:`grid2op.Action.Action`
The attack performed by the opponent. In this case, a do nothing, all the time.
duration: ``int``
The duration of the attack
"""
# TODO maybe have a class "GymOpponent" where the observation would include the budget and all other
# TODO information, and forward something to the "act" method.

# During creation of the environment, do not attack
if observation is None:
return None
return None, 0

# Decide the time of the next attack
if self._next_attack_time is None:
@@ -150,16 +153,17 @@

# If the attack time has not come yet, do not attack
if self._next_attack_time > 0:
return None
return None, 0

# If all attackable lines are disconnected, do not attack
status = observation.line_status[self._lines_ids]
if np.all(~status):
return None
return None, 0

available_attacks = self._attacks[status]
rho = observation.rho[self._lines_ids][status] / self._rho_normalization[status]
rho_sum = rho.sum()
if rho_sum <= 0.:
return None
attack = self.space_prng.choice(available_attacks, p=rho / rho_sum)
return attack
return attack, None
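With the new interface an opponent can also pick its own duration explicitly. Below is a hedged sketch of such an opponent; the class name, the `line_id` / `duration` parameters and the `"set_line_status"` action payload are illustrative assumptions, not part of this commit, and the chosen duration is still capped by `OpponentSpace`.

from grid2op.Opponent import BaseOpponent


class FixedDurationOpponent(BaseOpponent):
    """Hypothetical opponent: disconnects one line for a fixed number of steps."""

    def __init__(self, action_space, line_id=0, duration=5):
        BaseOpponent.__init__(self, action_space)
        # action disconnecting the targeted line (illustrative payload)
        self._attack = action_space({"set_line_status": [(line_id, -1)]})
        self._duration = duration

    def attack(self, observation, agent_action, env_action, budget, previous_fails):
        if observation is None:
            return None, 0  # environment creation: do not attack
        return self._attack, self._duration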
66 changes: 66 additions & 0 deletions grid2op/Reward/EpisodeDurationReward.py
@@ -0,0 +1,66 @@
# Copyright (c) 2019-2020, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of Grid2Op, Grid2Op a testbed platform to model sequential decision making in power systems.

import numpy as np
from grid2op.Reward.BaseReward import BaseReward
from grid2op.dtypes import dt_float


class EpisodeDurationReward(BaseReward):
"""
This reward will always be 0., unless at the end of an episode where it will return the number
of steps made by the agent divided by the total number of steps possible in the episode.
Examples
---------
You can use this reward in any environment with:
.. code-block:: python
import grid2op
from grid2op.Reward import EpisodeDurationReward
# then you create your environment with it:
NAME_OF_THE_ENVIRONMENT = "rte_case14_realistic"
env = grid2op.make(NAME_OF_THE_ENVIRONMENT,reward_class=EpisodeDurationReward)
# and do a step with a "do nothing" action
obs = env.reset()
obs, reward, done, info = env.step(env.action_space())
# the reward is computed with the EpisodeDurationReward class
Notes
-----
In case of an environment being "fast forwarded" (see :func:`grid2op.Environment.BaseEnv.fast_forward_chronics`)
the steps skipped during the fast forward are counted "as if" they were successful.
This means that if you "fast forward" up until the end of an episode, you are likely to receive a reward of 1.0
"""
def __init__(self, per_timestep=1):
BaseReward.__init__(self)
self.per_timestep = dt_float(per_timestep)
self.total_time_steps = dt_float(0.0)
self.reward_min = dt_float(0.)
self.reward_max = dt_float(1.)

def initialize(self, env):
if env.chronics_handler.max_timestep() > 0:
self.total_time_steps = env.chronics_handler.max_timestep() * self.per_timestep
else:
self.total_time_steps = np.inf
self.reward_max = np.inf

def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous):
if is_done:
res = env.nb_time_step
if np.isfinite(self.total_time_steps):
res /= self.total_time_steps
else:
res = self.reward_min
return res
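A usage sketch for the new reward class (the environment name and the `reward_class` keyword come from the docstring above, the loop itself is illustrative): the reward stays at its minimum until the episode terminates, then reflects how long the agent survived.

import grid2op
from grid2op.Reward import EpisodeDurationReward

env = grid2op.make("rte_case14_realistic", reward_class=EpisodeDurationReward)
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space())
# once the episode is over, reward is env.nb_time_step divided by the maximum
# number of steps in the episode (or the raw step count if that maximum is unbounded)
print(reward)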
3 changes: 1 addition & 2 deletions grid2op/Reward/IncreasingFlatReward.py
@@ -38,7 +38,6 @@ class IncreasingFlatReward(BaseReward):
def __init__(self, per_timestep=1):
BaseReward.__init__(self)
self.per_timestep = dt_float(per_timestep)
self.total_reward = dt_float(0.0)
self.reward_min = dt_float(0.0)

def initialize(self, env):
@@ -49,7 +48,7 @@ def initialize(self, env):

def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous):
if not has_error:
res = dt_float(env._nb_time_step * self.per_timestep)
res = dt_float(env.nb_time_step * self.per_timestep)
else:
res = self.reward_min
return res