#question _on_step method in custom callback #1179

Open

vrige opened this issue Mar 14, 2023 · 0 comments

vrige commented Mar 14, 2023

#question about the _on_step method in a custom callback that also uses a custom wrapper
I am new to stable_baselines and I was wondering if it is normal that the _on_step method of a callback starts after the reset of the env.
Isn't it supposed to start after the step method of the env (or of the wrapper, in my case)?
I know how I may avoid the issue (one possible workaround is sketched after the code below), but I was wondering if there is a way to call the callback exactly after the step method of the env.
The env is the custom env from the tutorial (the snake env).
The following code shows that the callback always prints episode_length and episode_return equal to zero, because the env has already been reset.

import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

# SnekEnv (the custom snake env from the tutorial) and logdir are defined elsewhere.

class WrapperStatistics(gym.Wrapper):
    def __init__(self, env: gym.Env, size: int = 250, verbose: int = 0):
        super(WrapperStatistics, self).__init__(env)
        self.verbose = verbose
        self.episode_count = 0
        self.steps_count = 0
        self.episode_rewards = np.empty(size, dtype=float)
        self.episodes_rewards = []
        self.episode_return = 0
        self.episodes_returns = np.empty(0, dtype=float)
        self.episode_length = 0
        self.episodes_lengths = np.empty(0, dtype=float)

    def reset(self, **kwargs):
        obs = super().reset(**kwargs)
        self.episode_rewards = np.empty(self.episode_rewards.size, dtype=float)
        self.episode_length = 0
        self.episode_return = 0
        print("I am resetting")
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)

        self.episode_length += 1
        self.steps_count += 1

        # Grow the per-step reward buffer when the episode outlives it
        if self.episode_length > self.episode_rewards.size:
            tmp = np.empty(self.episode_rewards.size, dtype=float)
            self.episode_rewards = np.concatenate((self.episode_rewards, tmp), axis=None)

        self.episode_rewards[self.episode_length - 1] = reward
        self.episode_return += reward

        if done:
            if self.verbose != 0:
                print('Episode: {}, len episode: {}, return episode: {}'.format(self.episode_count, self.episode_length, self.episode_return))
            self.episode_count += 1
            self.episodes_rewards.append(self.episode_rewards)
            self.episodes_returns = np.concatenate((self.episodes_returns, [self.episode_return]), axis=None)
            self.episodes_lengths = np.concatenate((self.episodes_lengths, [self.episode_length]), axis=None)
            if self.verbose == 2:
                print("rewards: " + str(self.episodes_returns))
                print("lengths: " + str(self.episodes_lengths))
            print("REWARD: " + str(self.get_episode_length()) )

        return obs, reward, done, info

    def get_episode_lengths(self):
        return self.episodes_lengths

    def get_episode_length(self):
        return self.episode_length

    def get_episode_rewards(self):
        return self.episodes_returns

    def get_episode_return(self):
        return self.episode_return

    def get_total_steps(self):
        return self.steps_count

    def get_total_episodes(self):
        return self.episode_count

class example(BaseCallback):
    def __init__(self, model, verbose=0):
        super(example, self).__init__(verbose)
        self.model = model
        self.training_env = model.get_env()
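        # Note: BaseCallback.init_callback() sets self.model and self.training_env
        # automatically when learn() starts, so passing the model here is not strictly needed.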

    def _on_training_start(self) -> None:
        pass

    def _on_step(self) -> bool:
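        # self.locals is the local-variable dict of the rollout collection loop;
        # for PPO it includes "rewards", "dones" and "infos" from the step just taken.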
        if self.locals["dones"]:
            print('Episode: {}, len episode: {}, return episode: {}'.format(
                *self.training_env.env_method("get_total_episodes"),
                *self.training_env.env_method("get_episode_length"),
                *self.training_env.env_method("get_episode_return")))
            print(type(*self.training_env.env_method("get_episode_length")))

        return True


env = SnekEnv()
env.reset()

wrapper = WrapperStatistics(env, 250, verbose=0)
wrapper.reset()

model = PPO('MlpPolicy', wrapper, verbose=1, tensorboard_log=logdir)

TIMESTEPS = 1000000

evalcallback = example(model)
callbacks=[evalcallback]

model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO", callback=callbacks)
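
For reference, here is a minimal sketch of the workaround I have in mind, assuming a single environment and PPO's rollout loop (the class name StatsCallback is made up for illustration). Since the VecEnv auto-resets a sub-env as soon as it reports done=True, the wrapper's counters are already cleared by the time _on_step runs; reading the transition from self.locals, which still holds the rewards and dones of the step that was just taken, avoids the problem.

class StatsCallback(BaseCallback):
    # Illustrative only: track episode length/return from self.locals
    # instead of querying the (already reset) wrapper via env_method.
    def __init__(self, verbose: int = 0):
        super(StatsCallback, self).__init__(verbose)
        self.episode_length = 0
        self.episode_return = 0.0

    def _on_step(self) -> bool:
        # self.locals holds the reward/done of the step that was just taken,
        # before the VecEnv auto-reset takes effect (single env assumed).
        reward = float(self.locals["rewards"][0])
        done = bool(self.locals["dones"][0])
        self.episode_length += 1
        self.episode_return += reward
        if done:
            print('Len episode: {}, return episode: {}'.format(
                self.episode_length, self.episode_return))
            self.episode_length = 0
            self.episode_return = 0.0
        return True

With this variant, example(model) above would simply be replaced by StatsCallback(); passing the model to the constructor is not needed because BaseCallback receives it through init_callback() when learn() starts.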