From 8b74f8221302dce9b42a05d5ff8ac0bdfc09b611 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Mon, 22 Mar 2021 21:34:53 +0800 Subject: [PATCH 01/12] refactor vpg & caculate_episodic_returns behavior --- test/base/test_returns.py | 24 +++++------ tianshou/policy/base.py | 33 +++++---------- tianshou/policy/modelfree/a2c.py | 57 +++++++------------------- tianshou/policy/modelfree/pg.py | 57 ++++++++++---------------- tianshou/policy/modelfree/ppo.py | 70 +++++++++++--------------------- 5 files changed, 82 insertions(+), 159 deletions(-) diff --git a/test/base/test_returns.py b/test/base/test_returns.py index e8d70de5c..fcf689036 100644 --- a/test/base/test_returns.py +++ b/test/base/test_returns.py @@ -30,9 +30,9 @@ def test_episodic_returns(size=2560): for b in batch: b.obs = b.act = 1 buf.add(b) - batch = fn(batch, buf, buf.sample_index(0), None, gamma=.1, gae_lambda=1) + returns, _ = fn(batch, buf, buf.sample_index(0), gamma=.1, gae_lambda=1) ans = np.array([0, 1.23, 2.3, 3, 4.5, 5, 6.7, 7]) - assert np.allclose(batch.returns, ans) + assert np.allclose(returns, ans) buf.reset() batch = Batch( done=np.array([0, 1, 0, 1, 0, 1, 0.]), @@ -41,9 +41,9 @@ def test_episodic_returns(size=2560): for b in batch: b.obs = b.act = 1 buf.add(b) - batch = fn(batch, buf, buf.sample_index(0), None, gamma=.1, gae_lambda=1) + returns, _ = fn(batch, buf, buf.sample_index(0), gamma=.1, gae_lambda=1) ans = np.array([7.6, 6, 1.2, 2, 3.4, 4, 5]) - assert np.allclose(batch.returns, ans) + assert np.allclose(returns, ans) buf.reset() batch = Batch( done=np.array([0, 1, 0, 1, 0, 0, 1.]), @@ -52,9 +52,9 @@ def test_episodic_returns(size=2560): for b in batch: b.obs = b.act = 1 buf.add(b) - batch = fn(batch, buf, buf.sample_index(0), None, gamma=.1, gae_lambda=1) + returns, _ = fn(batch, buf, buf.sample_index(0), gamma=.1, gae_lambda=1) ans = np.array([7.6, 6, 1.2, 2, 3.45, 4.5, 5]) - assert np.allclose(batch.returns, ans) + assert np.allclose(returns, ans) buf.reset() batch = Batch( done=np.array([0, 0, 0, 1., 0, 0, 0, 1, 0, 0, 0, 1]), @@ -64,12 +64,12 @@ def test_episodic_returns(size=2560): b.obs = b.act = 1 buf.add(b) v = np.array([2., 3., 4, -1, 5., 6., 7, -2, 8., 9., 10, -3]) - ret = fn(batch, buf, buf.sample_index(0), v, gamma=0.99, gae_lambda=0.95) - returns = np.array([ + returns, _ = fn(batch, buf, buf.sample_index(0), v, gamma=0.99, gae_lambda=0.95) + ground_truth = np.array([ 454.8344, 376.1143, 291.298, 200., 464.5610, 383.1085, 295.387, 201., 474.2876, 390.1027, 299.476, 202.]) - assert np.allclose(ret.returns, returns) + assert np.allclose(returns, ground_truth) buf.reset() batch = Batch( done=np.array([0, 0, 0, 1., 0, 0, 0, 1, 0, 0, 0, 1]), @@ -82,12 +82,12 @@ def test_episodic_returns(size=2560): b.obs = b.act = 1 buf.add(b) v = np.array([2., 3., 4, -1, 5., 6., 7, -2, 8., 9., 10, -3]) - ret = fn(batch, buf, buf.sample_index(0), v, gamma=0.99, gae_lambda=0.95) - returns = np.array([ + returns, _ = fn(batch, buf, buf.sample_index(0), v, gamma=0.99, gae_lambda=0.95) + ground_truth = np.array([ 454.0109, 375.2386, 290.3669, 199.01, 462.9138, 381.3571, 293.5248, 199.02, 474.2876, 390.1027, 299.476, 202.]) - assert np.allclose(ret.returns, returns) + assert np.allclose(returns, ground_truth) if __name__ == '__main__': buf = ReplayBuffer(size) diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 1d420173b..6b905087c 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -254,14 +254,14 @@ def compute_episodic_return( buffer: ReplayBuffer, indice: np.ndarray, v_s_: 
Optional[Union[np.ndarray, torch.Tensor]] = None, + v_s: Optional[Union[np.ndarray, torch.Tensor]] = None, gamma: float = 0.99, gae_lambda: float = 0.95, - rew_norm: bool = False, ) -> Batch: """Compute returns over given batch. Use Implementation of Generalized Advantage Estimator (arXiv:1506.02438) - to calculate q function/reward to go of given batch. + to calculate Q&A function/reward to go of given batch. :param Batch batch: a data batch which contains several episodes of data in sequential order. Mind that the end of each finished episode of batch @@ -273,8 +273,8 @@ def compute_episodic_return( :param float gamma: the discount factor, should be in [0, 1]. Default to 0.99. :param float gae_lambda: the parameter for Generalized Advantage Estimation, should be in [0, 1]. Default to 0.95. - :param bool rew_norm: normalize the reward to Normal(0, 1). Default to False. + # TODO change doc :return: a Batch. The result will be stored in batch.returns as a numpy array with shape (bsz, ). """ @@ -284,14 +284,16 @@ def compute_episodic_return( v_s_ = np.zeros_like(rew) else: v_s_ = to_numpy(v_s_.flatten()) * BasePolicy.value_mask(buffer, indice) + if v_s is None: + v_s = np.roll(v_s_, 1) + else: + v_s = to_numpy(v_s.flatten()) end_flag = batch.done.copy() end_flag[np.isin(indice, buffer.unfinished_index())] = True - returns = _episodic_return(v_s_, rew, end_flag, gamma, gae_lambda) - if rew_norm and not np.isclose(returns.std(), 0.0, 1e-2): - returns = (returns - returns.mean()) / returns.std() - batch.returns = returns - return batch + advantage = _gae_return(v_s, v_s_, rew, end_flag, gamma, gae_lambda) + returns = advantage + v_s + return returns, advantage @staticmethod def compute_nstep_return( @@ -355,8 +357,6 @@ def _compile(self) -> None: i64 = np.array([[0, 1]], dtype=np.int64) _gae_return(f64, f64, f64, b, 0.1, 0.1) _gae_return(f32, f32, f64, b, 0.1, 0.1) - _episodic_return(f64, f64, b, 0.1, 0.1) - _episodic_return(f32, f64, b, 0.1, 0.1) _nstep_return(f64, b, f32.reshape(-1, 1), i64, 0.1, 1) @@ -379,19 +379,6 @@ def _gae_return( return returns -@njit -def _episodic_return( - v_s_: np.ndarray, - rew: np.ndarray, - end_flag: np.ndarray, - gamma: float, - gae_lambda: float, -) -> np.ndarray: - """Numba speedup: 4.1s -> 0.057s.""" - v_s = np.roll(v_s_, 1) - return _gae_return(v_s, v_s_, rew, end_flag, gamma, gae_lambda) + v_s - - @njit def _nstep_return( rew: np.ndarray, diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index 0abf62cd1..a6f4fb0c6 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -53,17 +53,14 @@ def __init__( critic: torch.nn.Module, optim: torch.optim.Optimizer, dist_fn: Type[torch.distributions.Distribution], - discount_factor: float = 0.99, vf_coef: float = 0.5, ent_coef: float = 0.01, max_grad_norm: Optional[float] = None, gae_lambda: float = 0.95, - reward_normalization: bool = False, max_batchsize: int = 256, **kwargs: Any ) -> None: - super().__init__(None, optim, dist_fn, discount_factor, **kwargs) - self.actor = actor + super().__init__(actor, optim, dist_fn, **kwargs) self.critic = critic assert 0.0 <= gae_lambda <= 1.0, "GAE lambda should be in [0, 1]." 
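# --- Illustrative sketch (reviewer annotation, not part of the patch) ---
# A plain-NumPy version of the GAE recursion that compute_episodic_return
# now hands off to _gae_return, making the new (returns, advantage) return
# contract explicit.  It assumes v_s_ has already been zeroed at terminal
# transitions (value_mask) and that end_flag marks episode ends, as prepared
# in the base.py hunk above; the function name gae_sketch is made up here.
import numpy as np

def gae_sketch(v_s, v_s_, rew, end_flag, gamma, gae_lambda):
    delta = rew + gamma * v_s_ - v_s                 # TD residuals
    discount = (1.0 - end_flag) * gamma * gae_lambda  # zero across episode ends
    advantage = np.zeros(rew.shape)
    gae = 0.0
    for i in range(len(rew) - 1, -1, -1):            # backward accumulation
        gae = delta[i] + discount[i] * gae
        advantage[i] = gae
    return advantage + v_s, advantage                # returns = advantage + V(s)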
self._lambda = gae_lambda @@ -71,51 +68,27 @@ def __init__( self._weight_ent = ent_coef self._grad_norm = max_grad_norm self._batch = max_batchsize - self._rew_norm = reward_normalization def process_fn( self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray ) -> Batch: - if self._lambda in [0.0, 1.0]: - return self.compute_episodic_return( - batch, buffer, indice, - None, gamma=self._gamma, gae_lambda=self._lambda) - v_ = [] + v_s_ = [] with torch.no_grad(): for b in batch.split(self._batch, shuffle=False, merge_last=True): - v_.append(to_numpy(self.critic(b.obs_next))) - v_ = np.concatenate(v_, axis=0) - return self.compute_episodic_return( - batch, buffer, indice, v_, - gamma=self._gamma, gae_lambda=self._lambda, rew_norm=self._rew_norm) - - def forward( - self, - batch: Batch, - state: Optional[Union[dict, Batch, np.ndarray]] = None, - **kwargs: Any - ) -> Batch: - """Compute action over the given batch data. - - :return: A :class:`~tianshou.data.Batch` which has 4 keys: - - * ``act`` the action. - * ``logits`` the network's raw output. - * ``dist`` the action distribution. - * ``state`` the hidden state. - - .. seealso:: - - Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for - more detailed explanation. - """ - logits, h = self.actor(batch.obs, state=state, info=batch.info) - if isinstance(logits, tuple): - dist = self.dist_fn(*logits) + v_s_.append(to_numpy(self.critic(b.obs_next))) + v_s_ = np.concatenate(v_s_, axis=0) + if self._rew_norm: + # unnormalize v_s_ + v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.mean + un_normalized_returns, _ = self.compute_episodic_return( + batch, buffer, indice, v_s_, gamma=self._gamma, gae_lambda=self._lambda) + if self._rew_norm: + batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ + np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.update(un_normalized_returns) else: - dist = self.dist_fn(logits) - act = dist.sample() - return Batch(logits=logits, act=act, state=h, dist=dist) + batch.returns = un_normalized_returns + return batch def learn( # type: ignore self, batch: Batch, batch_size: int, repeat: int, **kwargs: Any diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index 4333112b4..eeb0b891e 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -4,6 +4,8 @@ from tianshou.policy import BasePolicy from tianshou.data import Batch, ReplayBuffer, to_torch_as +from tianshou.utils import RunningMeanStd + class PGPolicy(BasePolicy): @@ -40,19 +42,20 @@ def __init__( reward_normalization: bool = False, action_scaling: bool = True, action_bound_method: str = "clip", - lr_scheduler: Optional[torch.optim.lr_scheduler.LambdaLR] = None, + lr_scheduler: Optional[torch.optim.lr_scheduler] = None, **kwargs: Any, ) -> None: super().__init__(action_scaling=action_scaling, action_bound_method=action_bound_method, **kwargs) - if model is not None: - self.model: torch.nn.Module = model + self.actor = model self.optim = optim self.lr_scheduler = lr_scheduler self.dist_fn = dist_fn assert 0.0 <= discount_factor <= 1.0, "discount factor should be in [0, 1]" self._gamma = discount_factor self._rew_norm = reward_normalization + self.ret_rms = RunningMeanStd() + self.__eps = 1e-8 def process_fn( self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray @@ -65,11 +68,16 @@ def process_fn( where :math:`T` is the terminal time step, :math:`\gamma` is the discount factor, :math:`\gamma \in [0, 1]`. 
""" - # batch.returns = self._vanilla_returns(batch) - # batch.returns = self._vectorized_returns(batch) - return self.compute_episodic_return( - batch, buffer, indice, gamma=self._gamma, - gae_lambda=1.0, rew_norm=self._rew_norm) + v_s_ = np.full(indice.shape, self.ret_rms.mean) + un_normalized_returns, _ = self.compute_episodic_return( + batch, buffer, indice, v_s_, gamma=self._gamma, gae_lambda=1.0) + if self._rew_norm: + batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ + np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.update(un_normalized_returns) + else: + batch.returns = un_normalized_returns + return batch def forward( self, @@ -91,7 +99,7 @@ def forward( Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for more detailed explanation. """ - logits, h = self.model(batch.obs, state=state, info=batch.info) + logits, h = self.actor(batch.obs, state=state) if isinstance(logits, tuple): dist = self.dist_fn(*logits) else: @@ -106,9 +114,10 @@ def learn( # type: ignore for _ in range(repeat): for b in batch.split(batch_size, merge_last=True): self.optim.zero_grad() - dist = self(b).dist - a = to_torch_as(b.act, dist.logits) - r = to_torch_as(b.returns, dist.logits) + result = self(b) + dist = result.dist + a = to_torch_as(b.act, result.act) + r = to_torch_as(b.returns, result.act) log_prob = dist.log_prob(a).reshape(len(r), -1).transpose(0, 1) loss = -(log_prob * r).mean() loss.backward() @@ -119,27 +128,3 @@ def learn( # type: ignore self.lr_scheduler.step() return {"loss": losses} - - # def _vanilla_returns(self, batch): - # returns = batch.rew[:] - # last = 0 - # for i in range(len(returns) - 1, -1, -1): - # if not batch.done[i]: - # returns[i] += self._gamma * last - # last = returns[i] - # return returns - - # def _vectorized_returns(self, batch): - # # according to my tests, it is slower than _vanilla_returns - # # import scipy.signal - # convolve = np.convolve - # # convolve = scipy.signal.convolve - # rew = batch.rew[::-1] - # batch_size = len(rew) - # gammas = self._gamma ** np.arange(batch_size) - # c = convolve(rew, gammas)[:batch_size] - # T = np.where(batch.done[::-1])[0] - # d = np.zeros_like(rew) - # d[T] += c[T] - rew[T] - # d[T[1:]] -= d[T[:-1]] * self._gamma ** np.diff(T) - # return (c - convolve(d, gammas)[:batch_size])[::-1] diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index 4d81dd6cd..aef4aacb2 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -58,7 +58,6 @@ def __init__( critic: torch.nn.Module, optim: torch.optim.Optimizer, dist_fn: Type[torch.distributions.Distribution], - discount_factor: float = 0.99, max_grad_norm: Optional[float] = None, eps_clip: float = 0.2, vf_coef: float = 0.5, @@ -66,16 +65,14 @@ def __init__( gae_lambda: float = 0.95, dual_clip: Optional[float] = None, value_clip: bool = True, - reward_normalization: bool = True, max_batchsize: int = 256, **kwargs: Any, ) -> None: - super().__init__(None, optim, dist_fn, discount_factor, **kwargs) + super().__init__(actor, optim, dist_fn, **kwargs) self._max_grad_norm = max_grad_norm self._eps_clip = eps_clip self._weight_vf = vf_coef self._weight_ent = ent_coef - self.actor = actor self.critic = critic self._batch = max_batchsize assert 0.0 <= gae_lambda <= 1.0, "GAE lambda should be in [0, 1]." @@ -84,7 +81,6 @@ def __init__( "Dual-clip PPO parameter should greater than 1.0." 
self._dual_clip = dual_clip self._value_clip = value_clip - self._rew_norm = reward_normalization def process_fn( self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray @@ -93,55 +89,37 @@ def process_fn( mean, std = batch.rew.mean(), batch.rew.std() if not np.isclose(std, 0.0, 1e-2): batch.rew = (batch.rew - mean) / std - v, v_, old_log_prob = [], [], [] + v_s, v_s_, old_log_prob = [], [], [] with torch.no_grad(): for b in batch.split(self._batch, shuffle=False, merge_last=True): - v_.append(self.critic(b.obs_next)) - v.append(self.critic(b.obs)) - old_log_prob.append(self(b).dist.log_prob(to_torch_as(b.act, v[0]))) - v_ = to_numpy(torch.cat(v_, dim=0)) - batch = self.compute_episodic_return( - batch, buffer, indice, v_, gamma=self._gamma, - gae_lambda=self._lambda, rew_norm=self._rew_norm) - batch.v = torch.cat(v, dim=0).flatten() # old value - batch.act = to_torch_as(batch.act, v[0]) + v_s_.append(self.critic(b.obs_next)) + v_s.append(self.critic(b.obs)) + old_log_prob.append(self(b).dist.log_prob(to_torch_as(b.act, v_s[0]))) + batch.v_s = torch.cat(v_s, dim=0).flatten() # old value + v_s_ = to_numpy(torch.cat(v_s_, dim=0).flatten()) + v_s = to_numpy(batch.v_s) + if self._rew_norm: + # unnormalize v_s_ & v_s + v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.mean + v_s = v_s * np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.mean + un_normalized_returns, advantages = self.compute_episodic_return( + batch, buffer, indice, v_s_, v_s, gamma=self._gamma, gae_lambda=self._lambda) + if self._rew_norm: + batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ + np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.update(un_normalized_returns) + else: + batch.returns = un_normalized_returns + batch.act = to_torch_as(batch.act, v_s[0]) batch.logp_old = torch.cat(old_log_prob, dim=0) - batch.returns = to_torch_as(batch.returns, v[0]) - batch.adv = batch.returns - batch.v + batch.returns = to_torch_as(batch.returns, v_s[0]) + batch.adv = to_torch_as(advantages, v_s[0]) if self._rew_norm: mean, std = batch.adv.mean(), batch.adv.std() if not np.isclose(std.item(), 0.0, 1e-2): batch.adv = (batch.adv - mean) / std return batch - def forward( - self, - batch: Batch, - state: Optional[Union[dict, Batch, np.ndarray]] = None, - **kwargs: Any, - ) -> Batch: - """Compute action over the given batch data. - - :return: A :class:`~tianshou.data.Batch` which has 4 keys: - - * ``act`` the action. - * ``logits`` the network's raw output. - * ``dist`` the action distribution. - * ``state`` the hidden state. - - .. seealso:: - - Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for - more detailed explanation. 
- """ - logits, h = self.actor(batch.obs, state=state, info=batch.info) - if isinstance(logits, tuple): - dist = self.dist_fn(*logits) - else: - dist = self.dist_fn(logits) - act = dist.sample() - return Batch(logits=logits, act=act, state=h, dist=dist) - def learn( # type: ignore self, batch: Batch, batch_size: int, repeat: int, **kwargs: Any ) -> Dict[str, List[float]]: @@ -162,7 +140,7 @@ def learn( # type: ignore clip_loss = -torch.min(surr1, surr2).mean() clip_losses.append(clip_loss.item()) if self._value_clip: - v_clip = b.v + (value - b.v).clamp(-self._eps_clip, self._eps_clip) + v_clip = b.v_s + (value - b.v_s).clamp(-self._eps_clip, self._eps_clip) vf1 = (b.returns - value).pow(2) vf2 = (b.returns - v_clip).pow(2) vf_loss = 0.5 * torch.max(vf1, vf2).mean() From 66c102dad6dc4412a5bac893c86749c0c62abf12 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Mon, 22 Mar 2021 21:50:59 +0800 Subject: [PATCH 02/12] fix bug --- test/continuous/test_ppo.py | 3 ++- test/discrete/test_a2c_with_il.py | 3 ++- test/discrete/test_ppo.py | 3 ++- tianshou/policy/modelfree/a2c.py | 4 ++-- tianshou/policy/modelfree/pg.py | 6 +++--- tianshou/policy/modelfree/ppo.py | 12 ++++++------ 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py index 895d3c1f5..d21144dc3 100644 --- a/test/continuous/test_ppo.py +++ b/test/continuous/test_ppo.py @@ -91,7 +91,8 @@ def test_ppo(args=get_args()): def dist(*logits): return Independent(Normal(*logits), 1) policy = PPOPolicy( - actor, critic, optim, dist, args.gamma, + actor, critic, optim, dist, + discount_factor = args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, diff --git a/test/discrete/test_a2c_with_il.py b/test/discrete/test_a2c_with_il.py index 323d14848..724269e81 100644 --- a/test/discrete/test_a2c_with_il.py +++ b/test/discrete/test_a2c_with_il.py @@ -78,7 +78,8 @@ def test_a2c_with_il(args=get_args()): actor.parameters()).union(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = A2CPolicy( - actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda, + actor, critic, optim, dist, + discount_factor = args.gamma, gae_lambda=args.gae_lambda, vf_coef=args.vf_coef, ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm, reward_normalization=args.rew_norm, action_space=env.action_space) diff --git a/test/discrete/test_ppo.py b/test/discrete/test_ppo.py index 11428dc0d..7f50cd2be 100644 --- a/test/discrete/test_ppo.py +++ b/test/discrete/test_ppo.py @@ -80,7 +80,8 @@ def test_ppo(args=get_args()): actor.parameters()).union(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy( - actor, critic, optim, dist, args.gamma, + actor, critic, optim, dist, + discount_factor = args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index a6f4fb0c6..fbb389ae6 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -79,12 +79,12 @@ def process_fn( v_s_ = np.concatenate(v_s_, axis=0) if self._rew_norm: # unnormalize v_s_ - v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.mean + v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean un_normalized_returns, _ = self.compute_episodic_return( batch, buffer, indice, v_s_, gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: batch.returns = 
(un_normalized_returns - self.ret_rms.mean) / \ - np.sqrt(self.ret_rms.var + self.__eps) + np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(un_normalized_returns) else: batch.returns = un_normalized_returns diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index eeb0b891e..b4004de81 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -42,7 +42,7 @@ def __init__( reward_normalization: bool = False, action_scaling: bool = True, action_bound_method: str = "clip", - lr_scheduler: Optional[torch.optim.lr_scheduler] = None, + lr_scheduler: Optional[torch.optim.lr_scheduler.LambdaLR] = None, **kwargs: Any, ) -> None: super().__init__(action_scaling=action_scaling, @@ -55,7 +55,7 @@ def __init__( self._gamma = discount_factor self._rew_norm = reward_normalization self.ret_rms = RunningMeanStd() - self.__eps = 1e-8 + self._eps = 1e-8 def process_fn( self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray @@ -73,7 +73,7 @@ def process_fn( batch, buffer, indice, v_s_, gamma=self._gamma, gae_lambda=1.0) if self._rew_norm: batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ - np.sqrt(self.ret_rms.var + self.__eps) + np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(un_normalized_returns) else: batch.returns = un_normalized_returns diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index aef4aacb2..f9165eb52 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -100,20 +100,20 @@ def process_fn( v_s = to_numpy(batch.v_s) if self._rew_norm: # unnormalize v_s_ & v_s - v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.mean - v_s = v_s * np.sqrt(self.ret_rms.var + self.__eps) + self.ret_rms.mean + v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean + v_s = v_s * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean un_normalized_returns, advantages = self.compute_episodic_return( batch, buffer, indice, v_s_, v_s, gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ - np.sqrt(self.ret_rms.var + self.__eps) + np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(un_normalized_returns) else: batch.returns = un_normalized_returns - batch.act = to_torch_as(batch.act, v_s[0]) + batch.act = to_torch_as(batch.act, batch.v_s[0]) batch.logp_old = torch.cat(old_log_prob, dim=0) - batch.returns = to_torch_as(batch.returns, v_s[0]) - batch.adv = to_torch_as(advantages, v_s[0]) + batch.returns = to_torch_as(batch.returns, batch.v_s[0]) + batch.adv = to_torch_as(advantages, batch.v_s[0]) if self._rew_norm: mean, std = batch.adv.mean(), batch.adv.std() if not np.isclose(std.item(), 0.0, 1e-2): From 52203ce7f2fcc0527cae621f52563639595ba9cd Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Mon, 22 Mar 2021 21:56:28 +0800 Subject: [PATCH 03/12] change docstring --- tianshou/policy/modelfree/pg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index b4004de81..8dbdf3eff 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -9,7 +9,7 @@ class PGPolicy(BasePolicy): - """Implementation of Vanilla Policy Gradient. + """Implementation of REINFORCE algorithm. :param torch.nn.Module model: a model following the rules in :class:`~tianshou.policy.BasePolicy`. 
(s -> logits) From fa18832cac50b6973699330d11b522592e485485 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Mon, 22 Mar 2021 22:09:15 +0800 Subject: [PATCH 04/12] pep8 fix --- tianshou/policy/modelfree/a2c.py | 2 +- tianshou/policy/modelfree/pg.py | 1 - tianshou/policy/modelfree/ppo.py | 10 ++++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index fbb389ae6..806201b4a 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -2,7 +2,7 @@ import numpy as np from torch import nn import torch.nn.functional as F -from typing import Any, Dict, List, Type, Union, Optional +from typing import Any, Dict, List, Type, Optional from tianshou.policy import PGPolicy from tianshou.data import Batch, ReplayBuffer, to_torch_as, to_numpy diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index 8dbdf3eff..1c43a1b99 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -7,7 +7,6 @@ from tianshou.utils import RunningMeanStd - class PGPolicy(BasePolicy): """Implementation of REINFORCE algorithm. diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index f9165eb52..ad9f9e3b8 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -1,7 +1,7 @@ import torch import numpy as np from torch import nn -from typing import Any, Dict, List, Type, Union, Optional +from typing import Any, Dict, List, Type, Optional from tianshou.policy import PGPolicy from tianshou.data import Batch, ReplayBuffer, to_numpy, to_torch_as @@ -103,10 +103,11 @@ def process_fn( v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean v_s = v_s * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean un_normalized_returns, advantages = self.compute_episodic_return( - batch, buffer, indice, v_s_, v_s, gamma=self._gamma, gae_lambda=self._lambda) + batch, buffer, indice, v_s_, v_s, + gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ - np.sqrt(self.ret_rms.var + self._eps) + np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(un_normalized_returns) else: batch.returns = un_normalized_returns @@ -140,7 +141,8 @@ def learn( # type: ignore clip_loss = -torch.min(surr1, surr2).mean() clip_losses.append(clip_loss.item()) if self._value_clip: - v_clip = b.v_s + (value - b.v_s).clamp(-self._eps_clip, self._eps_clip) + v_clip = b.v_s + (value - b.v_s).clamp( + -self._eps_clip, self._eps_clip) vf1 = (b.returns - value).pow(2) vf2 = (b.returns - v_clip).pow(2) vf_loss = 0.5 * torch.max(vf1, vf2).mean() From b02c9d23ab7c1c02963247c52c078b881cb02157 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Mon, 22 Mar 2021 22:33:56 +0800 Subject: [PATCH 05/12] pep8fix --- test/continuous/test_ppo.py | 2 +- test/discrete/test_a2c_with_il.py | 2 +- test/discrete/test_ppo.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py index d21144dc3..336e4b673 100644 --- a/test/continuous/test_ppo.py +++ b/test/continuous/test_ppo.py @@ -92,7 +92,7 @@ def dist(*logits): return Independent(Normal(*logits), 1) policy = PPOPolicy( actor, critic, optim, dist, - discount_factor = args.gamma, + discount_factor=args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, diff --git a/test/discrete/test_a2c_with_il.py 
b/test/discrete/test_a2c_with_il.py index 724269e81..e9003ce8b 100644 --- a/test/discrete/test_a2c_with_il.py +++ b/test/discrete/test_a2c_with_il.py @@ -79,7 +79,7 @@ def test_a2c_with_il(args=get_args()): dist = torch.distributions.Categorical policy = A2CPolicy( actor, critic, optim, dist, - discount_factor = args.gamma, gae_lambda=args.gae_lambda, + discount_factor=args.gamma, gae_lambda=args.gae_lambda, vf_coef=args.vf_coef, ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm, reward_normalization=args.rew_norm, action_space=env.action_space) diff --git a/test/discrete/test_ppo.py b/test/discrete/test_ppo.py index 7f50cd2be..8ba380e9e 100644 --- a/test/discrete/test_ppo.py +++ b/test/discrete/test_ppo.py @@ -81,7 +81,7 @@ def test_ppo(args=get_args()): dist = torch.distributions.Categorical policy = PPOPolicy( actor, critic, optim, dist, - discount_factor = args.gamma, + discount_factor=args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, From 8902852f1688c86341ffb226a3003b2914373693 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Tue, 23 Mar 2021 14:21:08 +0800 Subject: [PATCH 06/12] fix test --- test/discrete/test_pg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/discrete/test_pg.py b/test/discrete/test_pg.py index 83e3c1f6b..3e150330f 100644 --- a/test/discrete/test_pg.py +++ b/test/discrete/test_pg.py @@ -27,7 +27,7 @@ def get_args(): parser.add_argument('--repeat-per-collect', type=int, default=2) parser.add_argument('--batch-size', type=int, default=64) parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128, 128, 128]) + nargs='*', default=[64, 64]) parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') From dc1312d1547e5eb276d8fbbdd562a4c6054f2621 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Tue, 23 Mar 2021 14:47:32 +0800 Subject: [PATCH 07/12] fix test & pep8 --- test/discrete/test_pg.py | 7 ++++++- tianshou/policy/base.py | 4 ++-- tianshou/policy/modelfree/a2c.py | 11 ++++++----- tianshou/policy/modelfree/pg.py | 10 +++++----- tianshou/policy/modelfree/ppo.py | 8 ++++---- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/test/discrete/test_pg.py b/test/discrete/test_pg.py index 3e150330f..d96609a26 100644 --- a/test/discrete/test_pg.py +++ b/test/discrete/test_pg.py @@ -17,7 +17,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='CartPole-v0') - parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--seed', type=int, default=1) parser.add_argument('--buffer-size', type=int, default=20000) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.95) @@ -65,6 +65,11 @@ def test_pg(args=get_args()): policy = PGPolicy(net, optim, dist, args.gamma, reward_normalization=args.rew_norm, action_space=env.action_space) + for m in net.modules(): + if isinstance(m, torch.nn.Linear): + # orthogonal initialization + torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2)) + torch.nn.init.zeros_(m.bias) # collector train_collector = Collector( policy, train_envs, diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 6b905087c..9d75ff94c 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -4,7 +4,7 @@ from torch import nn from numba import njit from abc import ABC, abstractmethod -from typing 
import Any, Dict, Union, Optional, Callable +from typing import Any, Dict, Union, Optional, Callable, Tuple from tianshou.data import Batch, ReplayBuffer, to_torch_as, to_numpy @@ -257,7 +257,7 @@ def compute_episodic_return( v_s: Optional[Union[np.ndarray, torch.Tensor]] = None, gamma: float = 0.99, gae_lambda: float = 0.95, - ) -> Batch: + ) -> Tuple[np.ndarray, np.ndarray]: """Compute returns over given batch. Use Implementation of Generalized Advantage Estimator (arXiv:1506.02438) diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index 806201b4a..db19f1302 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -80,14 +80,15 @@ def process_fn( if self._rew_norm: # unnormalize v_s_ v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean - un_normalized_returns, _ = self.compute_episodic_return( - batch, buffer, indice, v_s_, gamma=self._gamma, gae_lambda=self._lambda) + unnormalized_returns, _ = self.compute_episodic_return( + batch, buffer, indice, v_s_=v_s_, + gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: - batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ + batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ np.sqrt(self.ret_rms.var + self._eps) - self.ret_rms.update(un_normalized_returns) + self.ret_rms.update(unnormalized_returns) else: - batch.returns = un_normalized_returns + batch.returns = unnormalized_returns return batch def learn( # type: ignore diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index 1c43a1b99..f2207b268 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -68,14 +68,14 @@ def process_fn( discount factor, :math:`\gamma \in [0, 1]`. """ v_s_ = np.full(indice.shape, self.ret_rms.mean) - un_normalized_returns, _ = self.compute_episodic_return( - batch, buffer, indice, v_s_, gamma=self._gamma, gae_lambda=1.0) + unnormalized_returns, _ = self.compute_episodic_return( + batch, buffer, indice, v_s_=v_s_, gamma=self._gamma, gae_lambda=1.0) if self._rew_norm: - batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ + batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ np.sqrt(self.ret_rms.var + self._eps) - self.ret_rms.update(un_normalized_returns) + self.ret_rms.update(unnormalized_returns) else: - batch.returns = un_normalized_returns + batch.returns = unnormalized_returns return batch def forward( diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index ad9f9e3b8..02bf0aadb 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -102,15 +102,15 @@ def process_fn( # unnormalize v_s_ & v_s v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean v_s = v_s * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean - un_normalized_returns, advantages = self.compute_episodic_return( + unnormalized_returns, advantages = self.compute_episodic_return( batch, buffer, indice, v_s_, v_s, gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: - batch.returns = (un_normalized_returns - self.ret_rms.mean) / \ + batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ np.sqrt(self.ret_rms.var + self._eps) - self.ret_rms.update(un_normalized_returns) + self.ret_rms.update(unnormalized_returns) else: - batch.returns = un_normalized_returns + batch.returns = unnormalized_returns batch.act = to_torch_as(batch.act, batch.v_s[0]) batch.logp_old = torch.cat(old_log_prob, dim=0) batch.returns = 
to_torch_as(batch.returns, batch.v_s[0]) From f278a27a3df820f005cf7162ccab4fc4172e648f Mon Sep 17 00:00:00 2001 From: ChenDRAG <40993476+ChenDRAG@users.noreply.github.com> Date: Tue, 23 Mar 2021 14:49:46 +0800 Subject: [PATCH 08/12] Update base.py minor --- tianshou/policy/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 9d75ff94c..e6fe0032d 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -261,7 +261,7 @@ def compute_episodic_return( """Compute returns over given batch. Use Implementation of Generalized Advantage Estimator (arXiv:1506.02438) - to calculate Q&A function/reward to go of given batch. + to calculate Q value and advantage of given batch. :param Batch batch: a data batch which contains several episodes of data in sequential order. Mind that the end of each finished episode of batch From 0b7d76f265f3e48beffbdb4e61a4012a6ef56e26 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Tue, 23 Mar 2021 16:12:18 +0800 Subject: [PATCH 09/12] fix mypy --- tianshou/policy/modelfree/a2c.py | 2 +- tianshou/policy/modelfree/pg.py | 4 ++-- tianshou/policy/modelfree/ppo.py | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index db19f1302..3a7af5792 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -85,7 +85,7 @@ def process_fn( gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ - np.sqrt(self.ret_rms.var + self._eps) + np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(unnormalized_returns) else: batch.returns = unnormalized_returns diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index f2207b268..ac06f1c00 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -34,7 +34,7 @@ class PGPolicy(BasePolicy): def __init__( self, - model: Optional[torch.nn.Module], + model: torch.nn.Module, optim: torch.optim.Optimizer, dist_fn: Type[torch.distributions.Distribution], discount_factor: float = 0.99, @@ -72,7 +72,7 @@ def process_fn( batch, buffer, indice, v_s_=v_s_, gamma=self._gamma, gae_lambda=1.0) if self._rew_norm: batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ - np.sqrt(self.ret_rms.var + self._eps) + np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(unnormalized_returns) else: batch.returns = unnormalized_returns diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index 02bf0aadb..89ab3081a 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -103,8 +103,8 @@ def process_fn( v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean v_s = v_s * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean unnormalized_returns, advantages = self.compute_episodic_return( - batch, buffer, indice, v_s_, v_s, - gamma=self._gamma, gae_lambda=self._lambda) + batch, buffer, indice, v_s_, v_s, + gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ np.sqrt(self.ret_rms.var + self._eps) @@ -116,8 +116,8 @@ def process_fn( batch.returns = to_torch_as(batch.returns, batch.v_s[0]) batch.adv = to_torch_as(advantages, batch.v_s[0]) if self._rew_norm: - mean, std = batch.adv.mean(), batch.adv.std() - if not np.isclose(std.item(), 0.0, 1e-2): + mean, std = np.mean(advantages), 
np.std(advantages) + if not np.isclose(std, 0.0, 1e-2): batch.adv = (batch.adv - mean) / std return batch @@ -142,7 +142,7 @@ def learn( # type: ignore clip_losses.append(clip_loss.item()) if self._value_clip: v_clip = b.v_s + (value - b.v_s).clamp( - -self._eps_clip, self._eps_clip) + -self._eps_clip, self._eps_clip) vf1 = (b.returns - value).pow(2) vf2 = (b.returns - v_clip).pow(2) vf_loss = 0.5 * torch.max(vf1, vf2).mean() From b0e7f5c0a6fda5df7d6cb92734d2cd3b52589ff2 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Tue, 23 Mar 2021 16:30:49 +0800 Subject: [PATCH 10/12] fix test --- tianshou/policy/base.py | 4 +--- tianshou/policy/modelfree/ppo.py | 30 +++++++++++++----------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index e6fe0032d..ee294d8aa 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -274,9 +274,7 @@ def compute_episodic_return( :param float gae_lambda: the parameter for Generalized Advantage Estimation, should be in [0, 1]. Default to 0.95. - # TODO change doc - :return: a Batch. The result will be stored in batch.returns as a numpy - array with shape (bsz, ). + :return: two numpy arrays (returns, advantage) with each shape (bsz, ). """ rew = batch.rew if v_s_ is None: diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index 89ab3081a..e1192ba1b 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -3,11 +3,11 @@ from torch import nn from typing import Any, Dict, List, Type, Optional -from tianshou.policy import PGPolicy +from tianshou.policy import A2CPolicy from tianshou.data import Batch, ReplayBuffer, to_numpy, to_torch_as -class PPOPolicy(PGPolicy): +class PPOPolicy(A2CPolicy): r"""Implementation of Proximal Policy Optimization. arXiv:1707.06347. :param torch.nn.Module actor: the actor network following the rules in @@ -30,8 +30,8 @@ class PPOPolicy(PGPolicy): Default to 5.0 (set None if you do not want to use it). :param bool value_clip: a parameter mentioned in arXiv:1811.02553 Sec. 4.1. Default to True. - :param bool reward_normalization: normalize the returns to Normal(0, 1). - Default to True. + :param bool reward_normalization: normalize the returns and advantage to + Normal(0, 1). Default to False. :param int max_batchsize: the maximum size of the batch when computing GAE, depends on the size of available memory and the memory cost of the model; should be as large as possible within the memory constraint. @@ -68,15 +68,11 @@ def __init__( max_batchsize: int = 256, **kwargs: Any, ) -> None: - super().__init__(actor, optim, dist_fn, **kwargs) - self._max_grad_norm = max_grad_norm + super().__init__( + actor, critic, optim, dist_fn, max_grad_norm=max_grad_norm, + vf_coef=vf_coef, ent_coef=ent_coef, gae_lambda=gae_lambda, + max_batchsize=max_batchsize, **kwargs) self._eps_clip = eps_clip - self._weight_vf = vf_coef - self._weight_ent = ent_coef - self.critic = critic - self._batch = max_batchsize - assert 0.0 <= gae_lambda <= 1.0, "GAE lambda should be in [0, 1]." - self._lambda = gae_lambda assert dual_clip is None or dual_clip > 1.0, \ "Dual-clip PPO parameter should greater than 1.0." 
self._dual_clip = dual_clip @@ -111,10 +107,10 @@ def process_fn( self.ret_rms.update(unnormalized_returns) else: batch.returns = unnormalized_returns - batch.act = to_torch_as(batch.act, batch.v_s[0]) + batch.act = to_torch_as(batch.act, batch.v_s) batch.logp_old = torch.cat(old_log_prob, dim=0) - batch.returns = to_torch_as(batch.returns, batch.v_s[0]) - batch.adv = to_torch_as(advantages, batch.v_s[0]) + batch.returns = to_torch_as(batch.returns, batch.v_s) + batch.adv = to_torch_as(advantages, batch.v_s) if self._rew_norm: mean, std = np.mean(advantages), np.std(advantages) if not np.isclose(std, 0.0, 1e-2): @@ -156,10 +152,10 @@ def learn( # type: ignore losses.append(loss.item()) self.optim.zero_grad() loss.backward() - if self._max_grad_norm: + if self._grad_norm is not None: nn.utils.clip_grad_norm_( list(self.actor.parameters()) + list(self.critic.parameters()), - self._max_grad_norm) + self._grad_norm) self.optim.step() # update learning rate if lr_scheduler is given if self.lr_scheduler is not None: From f11a76a9342c89dcbb9220e9cf22d482d8d8f26a Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Tue, 23 Mar 2021 16:45:27 +0800 Subject: [PATCH 11/12] fix bug --- tianshou/policy/base.py | 5 +++-- tianshou/policy/modelfree/ppo.py | 10 ++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index ee294d8aa..010951f6c 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -4,7 +4,7 @@ from torch import nn from numba import njit from abc import ABC, abstractmethod -from typing import Any, Dict, Union, Optional, Callable, Tuple +from typing import Any, Dict, Tuple, Union, Optional, Callable from tianshou.data import Batch, ReplayBuffer, to_torch_as, to_numpy @@ -261,7 +261,7 @@ def compute_episodic_return( """Compute returns over given batch. Use Implementation of Generalized Advantage Estimator (arXiv:1506.02438) - to calculate Q value and advantage of given batch. + to calculate q/advantage value of given batch. :param Batch batch: a data batch which contains several episodes of data in sequential order. 
Mind that the end of each finished episode of batch @@ -291,6 +291,7 @@ def compute_episodic_return( end_flag[np.isin(indice, buffer.unfinished_index())] = True advantage = _gae_return(v_s, v_s_, rew, end_flag, gamma, gae_lambda) returns = advantage + v_s + # normalization is varied from each policy, so we don't do it here return returns, advantage @staticmethod diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index e1192ba1b..30b2a2f4d 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -81,10 +81,6 @@ def __init__( def process_fn( self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray ) -> Batch: - if self._rew_norm: - mean, std = batch.rew.mean(), batch.rew.std() - if not np.isclose(std, 0.0, 1e-2): - batch.rew = (batch.rew - mean) / std v_s, v_s_, old_log_prob = [], [], [] with torch.no_grad(): for b in batch.split(self._batch, shuffle=False, merge_last=True): @@ -105,16 +101,14 @@ def process_fn( batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(unnormalized_returns) + mean, std = np.mean(advantages), np.std(advantages) + advantages = (advantages - mean) / std else: batch.returns = unnormalized_returns batch.act = to_torch_as(batch.act, batch.v_s) batch.logp_old = torch.cat(old_log_prob, dim=0) batch.returns = to_torch_as(batch.returns, batch.v_s) batch.adv = to_torch_as(advantages, batch.v_s) - if self._rew_norm: - mean, std = np.mean(advantages), np.std(advantages) - if not np.isclose(std, 0.0, 1e-2): - batch.adv = (batch.adv - mean) / std return batch def learn( # type: ignore From 991b4e054194c23c800a286c24aecc1f255ba09f Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Tue, 23 Mar 2021 21:32:51 +0800 Subject: [PATCH 12/12] update --- tianshou/policy/base.py | 7 ++----- tianshou/policy/modelfree/a2c.py | 3 +-- tianshou/policy/modelfree/ppo.py | 11 +++++------ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 010951f6c..b29706575 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -282,16 +282,13 @@ def compute_episodic_return( v_s_ = np.zeros_like(rew) else: v_s_ = to_numpy(v_s_.flatten()) * BasePolicy.value_mask(buffer, indice) - if v_s is None: - v_s = np.roll(v_s_, 1) - else: - v_s = to_numpy(v_s.flatten()) + v_s = np.roll(v_s_, 1) if v_s is None else to_numpy(v_s.flatten()) end_flag = batch.done.copy() end_flag[np.isin(indice, buffer.unfinished_index())] = True advantage = _gae_return(v_s, v_s_, rew, end_flag, gamma, gae_lambda) returns = advantage + v_s - # normalization is varied from each policy, so we don't do it here + # normalization varies from each policy, so we don't do it here return returns, advantage @staticmethod diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index 3a7af5792..3dd1e561a 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -77,8 +77,7 @@ def process_fn( for b in batch.split(self._batch, shuffle=False, merge_last=True): v_s_.append(to_numpy(self.critic(b.obs_next))) v_s_ = np.concatenate(v_s_, axis=0) - if self._rew_norm: - # unnormalize v_s_ + if self._rew_norm: # unnormalize v_s_ v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean unnormalized_returns, _ = self.compute_episodic_return( batch, buffer, indice, v_s_=v_s_, diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index 30b2a2f4d..db7a22c6f 100644 
--- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -84,16 +84,15 @@ def process_fn( v_s, v_s_, old_log_prob = [], [], [] with torch.no_grad(): for b in batch.split(self._batch, shuffle=False, merge_last=True): - v_s_.append(self.critic(b.obs_next)) v_s.append(self.critic(b.obs)) + v_s_.append(self.critic(b.obs_next)) old_log_prob.append(self(b).dist.log_prob(to_torch_as(b.act, v_s[0]))) batch.v_s = torch.cat(v_s, dim=0).flatten() # old value - v_s_ = to_numpy(torch.cat(v_s_, dim=0).flatten()) v_s = to_numpy(batch.v_s) - if self._rew_norm: - # unnormalize v_s_ & v_s - v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean + v_s_ = to_numpy(torch.cat(v_s_, dim=0).flatten()) + if self._rew_norm: # unnormalize v_s & v_s_ v_s = v_s * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean + v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean unnormalized_returns, advantages = self.compute_episodic_return( batch, buffer, indice, v_s_, v_s, gamma=self._gamma, gae_lambda=self._lambda) @@ -102,7 +101,7 @@ def process_fn( np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(unnormalized_returns) mean, std = np.mean(advantages), np.std(advantages) - advantages = (advantages - mean) / std + advantages = (advantages - mean) / std # per-batch norm else: batch.returns = unnormalized_returns batch.act = to_torch_as(batch.act, batch.v_s)
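# --- Illustrative sketch (reviewer annotation, not part of the patch) ---
# The return-normalization round trip that pg.py, a2c.py and ppo.py now
# share via the RunningMeanStd tracker (self.ret_rms, self._eps = 1e-8).
# The helper names below are made up for illustration only.
import numpy as np

def unnormalize_value(v, ret_rms, eps=1e-8):
    # the critic is trained on normalized returns, so map its output back
    # to the raw return scale before building the GAE targets
    return v * np.sqrt(ret_rms.var + eps) + ret_rms.mean

def normalize_returns(unnormalized_returns, ret_rms, eps=1e-8):
    # scale the fresh targets with the current statistics, then update the
    # tracker so later batches account for these returns as well
    scaled = (unnormalized_returns - ret_rms.mean) / np.sqrt(ret_rms.var + eps)
    ret_rms.update(unnormalized_returns)
    return scaled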