demo(nyz): slime volleyball league training #229

Merged · 7 commits · Mar 19, 2022
7 changes: 3 additions & 4 deletions ding/worker/collector/battle_interaction_serial_evaluator.py
@@ -174,7 +174,7 @@ def eval(
train_iter: int = -1,
envstep: int = -1,
n_episode: Optional[int] = None
) -> Tuple[bool, float, list]:
) -> Tuple[bool, List[dict]]:
'''
Overview:
Evaluate policy and store the best policy based on whether it reaches the highest historical reward.
@@ -185,8 +185,7 @@ def eval(
- n_episode (:obj:`int`): Number of evaluation episodes.
Returns:
- stop_flag (:obj:`bool`): Whether this training program can be ended.
- eval_reward (:obj:`float`): Current eval_reward.
- return_info (:obj:`list`): Environment information of each finished episode
- return_info (:obj:`list`): Environment information of each finished episode.
'''
if n_episode is None:
n_episode = self._default_n_episode
@@ -273,7 +272,7 @@ def eval(
"Current eval_reward: {} is greater than stop_value: {}".format(eval_reward, self._stop_value) +
", so your RL agent is converged, you can refer to 'log/evaluator/evaluator_logger.txt' for details."
)
return stop_flag, eval_reward, return_info
return stop_flag, return_info


class VectorEvalMonitor(object):
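With this hunk, BattleInteractionSerialEvaluator.eval returns (stop_flag, return_info) instead of (stop_flag, eval_reward, return_info), so every call site now unpacks two values. A minimal sketch of the new call-site pattern, reusing the evaluator1 / main_learner / main_collector names from the league demo below; episode_info is indexed by policy, so entry 0 belongs to the main (learning) agent:

# New two-value unpacking; the scalar eval_reward is no longer part of the return.
stop_flag1, episode_info = evaluator1.eval(
    main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
)
# Each element of episode_info[0] is the env info dict of one finished episode
# for the main policy, e.g. its win/loss 'result' field used for TrueSkill updates.
win_loss_result = [e['result'] for e in episode_info[0]]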
8 changes: 5 additions & 3 deletions ding/worker/collector/interaction_serial_evaluator.py
@@ -155,7 +155,7 @@ def eval(
train_iter: int = -1,
envstep: int = -1,
n_episode: Optional[int] = None
) -> Tuple[bool, float]:
) -> Tuple[bool, dict]:
'''
Overview:
Evaluate policy and store the best policy based on whether it reaches the highest historical reward.
@@ -166,13 +166,14 @@ def eval(
- n_episode (:obj:`int`): Number of evaluation episodes.
Returns:
- stop_flag (:obj:`bool`): Whether this training program can be ended.
- eval_reward (:obj:`float`): Current eval_reward.
- return_info (:obj:`dict`): Current evaluation return information.
'''
if n_episode is None:
n_episode = self._default_n_episode
assert n_episode is not None, "please indicate eval n_episode"
envstep_count = 0
info = {}
return_info = []
eval_monitor = VectorEvalMonitor(self._env.env_num, n_episode)
self._env.reset()
self._policy.reset()
@@ -198,6 +199,7 @@ def eval(
if 'episode_info' in t.info:
eval_monitor.update_info(env_id, t.info['episode_info'])
eval_monitor.update_reward(env_id, reward)
return_info.append(t.info)
self._logger.info(
"[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format(
env_id, eval_monitor.get_latest_reward(env_id), eval_monitor.get_current_episode()
@@ -245,4 +247,4 @@ def eval(
"Current eval_reward: {} is greater than stop_value: {}".format(eval_reward, self._stop_value) +
", so your RL agent is converged, you can refer to 'log/evaluator/evaluator_logger.txt' for details."
)
return stop_flag, eval_reward
return stop_flag, return_info
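InteractionSerialEvaluator.eval now appends the final t.info of every finished episode to return_info and returns (stop_flag, return_info); the scalar eval_reward is no longer handed back to callers. Code that still wants a mean evaluation reward can aggregate it itself. A minimal sketch, assuming each info dict exposes a final-reward field; the key name 'final_eval_reward' is an assumption and depends on the env wrapper:

import numpy as np

# evaluator / learner / collector / tb_logger are the usual DI-engine objects.
stop_flag, return_info = evaluator.eval(
    learner.save_checkpoint, learner.train_iter, collector.envstep
)
# 'final_eval_reward' is a hypothetical key; substitute whatever your env reports.
episode_rewards = [info['final_eval_reward'] for info in return_info]
reward_mean = float(np.mean(episode_rewards))
tb_logger.add_scalar('evaluator_step/reward_mean', reward_mean, collector.envstep)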
9 changes: 3 additions & 6 deletions dizoo/league_demo/league_demo_ppo_main.py
@@ -170,36 +170,33 @@ def load_checkpoint_fn(player_id: str, ckpt_path: str):
count = 0
while True:
if evaluator1.should_eval(main_learner.train_iter):
stop_flag1, reward, episode_info = evaluator1.eval(
stop_flag1, episode_info = evaluator1.eval(
main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
)
win_loss_result = [e['result'] for e in episode_info[0]]
# set fixed NE policy trueskill(exposure) equal 10
main_player.rating = league.metric_env.rate_1vsC(
main_player.rating, league.metric_env.create_rating(mu=10, sigma=1e-8), win_loss_result
)
tb_logger.add_scalar('fixed_evaluator_step/reward_mean', reward, main_collector.envstep)

if evaluator2.should_eval(main_learner.train_iter):
stop_flag2, reward, episode_info = evaluator2.eval(
stop_flag2, episode_info = evaluator2.eval(
main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
)
win_loss_result = [e['result'] for e in episode_info[0]]
# set random(uniform) policy trueskill(exposure) equal 0
main_player.rating = league.metric_env.rate_1vsC(
main_player.rating, league.metric_env.create_rating(mu=0, sigma=1e-8), win_loss_result
)
tb_logger.add_scalar('uniform_evaluator_step/reward_mean', reward, main_collector.envstep)
if evaluator3.should_eval(main_learner.train_iter):
_, reward, episode_info = evaluator3.eval(
_, episode_info = evaluator3.eval(
main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
)
win_loss_result = [e['result'] for e in episode_info[0]]
# use init main player as another evaluator metric
main_player.rating, init_main_player_rating = league.metric_env.rate_1vs1(
main_player.rating, init_main_player_rating, win_loss_result
)
tb_logger.add_scalar('init_evaluator_step/reward_mean', reward, main_collector.envstep)
tb_logger.add_scalar(
'league/init_main_player_trueskill', init_main_player_rating.exposure, main_collector.envstep
)
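The three evaluators above rate the main player against opponents of pinned strength (the fixed NE policy at mu=10, the uniform-random policy at mu=0) or against the frozen initial main player. A rough illustration of the idea behind league.metric_env.rate_1vsC, written with the standalone trueskill package rather than DI-engine's wrapper: only the learner's rating is updated, while the anchor's rating is kept fixed. The result labels ('wins' / 'losses') are illustrative; the exact strings come from the env's episode info.

import trueskill

env = trueskill.TrueSkill(draw_probability=0.0)
main_rating = env.create_rating()              # learning agent, gets updated
anchor = env.create_rating(mu=10, sigma=1e-8)  # fixed-strength opponent (e.g. the NE policy)

win_loss_result = ['wins', 'losses', 'wins']   # one label per evaluated episode (illustrative)
for result in win_loss_result:
    if result == 'wins':
        main_rating, _ = env.rate_1vs1(main_rating, anchor)  # winner goes first
    else:
        _, main_rating = env.rate_1vs1(anchor, main_rating)
    # the anchor's updated rating is discarded so its skill stays pinned

print(env.expose(main_rating))  # conservative skill estimate (mu - 3 * sigma)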
6 changes: 2 additions & 4 deletions dizoo/league_demo/selfplay_demo_ppo_main.py
@@ -109,11 +109,9 @@ def main(cfg, seed=0, max_train_iter=int(1e8), max_env_step=int(1e8)):

while True:
if evaluator1.should_eval(learner1.train_iter):
stop_flag1, reward, _ = evaluator1.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
tb_logger.add_scalar('fixed_evaluator_step/reward_mean', reward, collector.envstep)
stop_flag1, _ = evaluator1.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
if evaluator2.should_eval(learner1.train_iter):
stop_flag2, reward, _ = evaluator2.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
tb_logger.add_scalar('uniform_evaluator_step/reward_mean', reward, collector.envstep)
stop_flag2, _ = evaluator2.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
if stop_flag1 and stop_flag2:
break
train_data, _ = collector.collect(train_iter=learner1.train_iter)
31 changes: 8 additions & 23 deletions dizoo/slime_volley/config/slime_volley_league_ppo_config.py
@@ -1,7 +1,7 @@
from easydict import EasyDict

league_demo_ppo_config = dict(
exp_name="slime_volley_league_ppo",
exp_name="slime_volley_league_ppo_base",
env=dict(
collector_env_num=8,
evaluator_env_num=5,
@@ -23,43 +23,28 @@
),
learn=dict(
epoch_per_collect=5,
batch_size=64,
batch_size=256,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.0,
entropy_weight=0.005,
clip_ratio=0.2,
),
collect=dict(
n_episode=32,
n_episode=16,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.95,
),
other=dict(
league=dict(
player_category=['default'],
path_policy="slime_volley_league_ppo/policy",
active_players=dict(main_player=1,
# main_exploiter=1,
# league_exploiter=1,
),
path_policy="slime_volley_league_ppo_base/policy",
active_players=dict(main_player=1, ),
main_player=dict(
one_phase_step=2000,
branch_probs=dict(pfsp=1.0, ),
one_phase_step=20000,
branch_probs=dict(pfsp=0.2, sp=0.8),
strong_win_rate=0.7,
),
main_exploiter=dict(
one_phase_step=2000,
branch_probs=dict(main_players=1.0, ),
strong_win_rate=0.7,
min_valid_win_rate=0.3,
),
league_exploiter=dict(
one_phase_step=2000,
branch_probs=dict(pfsp=1.0, ),
strong_win_rate=0.7,
mutate_prob=0.5,
),
use_pretrain=False,
use_pretrain_init_historical=False,
payoff=dict(
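Reassembled from the hunk above, the trimmed other.league block now keeps only the main player and shifts most of its branch probability to plain self-play (entries truncated in the diff, such as payoff, are omitted here):

from easydict import EasyDict

league = EasyDict(
    dict(
        player_category=['default'],
        path_policy="slime_volley_league_ppo_base/policy",
        # only the main player stays active; main/league exploiters were removed
        active_players=dict(main_player=1, ),
        main_player=dict(
            one_phase_step=20000,                 # longer phase before each snapshot
            branch_probs=dict(pfsp=0.2, sp=0.8),  # mostly self-play, some prioritized fictitious self-play
            strong_win_rate=0.7,
        ),
        use_pretrain=False,
        use_pretrain_init_historical=False,
        # payoff=dict(...) continues below the visible part of the diff
    )
)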
13 changes: 10 additions & 3 deletions dizoo/slime_volley/entry/slime_volley_league_ppo_main.py
@@ -126,16 +126,20 @@ def load_checkpoint_fn(player_id: str, ckpt_path: str):
for player_id, player_ckpt_path in zip(league.active_players_ids, league.active_players_ckpts):
torch.save(policies[player_id].collect_mode.state_dict(), player_ckpt_path)
league.judge_snapshot(player_id, force=True)
init_main_player_rating = league.metric_env.create_rating(mu=0)

set_pkg_seed(seed, use_cuda=cfg.policy.cuda)

count = 0
while True:
if evaluator.should_eval(main_learner.train_iter):
stop_flag, reward = evaluator.eval(
stop_flag, eval_episode_info = evaluator.eval(
main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
)
win_loss_result = [e['result'] for e in eval_episode_info]
# set eval bot rating as 100
main_player.rating = league.metric_env.rate_1vsC(
main_player.rating, league.metric_env.create_rating(mu=100, sigma=1e-8), win_loss_result
)
if stop_flag:
break
for player_id, player_ckpt_path in zip(league.active_players_ids, league.active_players_ckpts):
@@ -176,7 +180,10 @@ def load_checkpoint_fn(player_id: str, ckpt_path: str):
}
league.finish_job(job_finish_info)
if count % 10 == 0:
print(repr(league.payoff))
payoff_string = repr(league.payoff)
rank_string = league.player_rank(string=True)
tb_logger.add_text('payoff_step', payoff_string, main_collector.envstep)
tb_logger.add_text('rank_step', rank_string, main_collector.envstep)
count += 1


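Instead of printing the payoff table to stdout every 10 jobs, the entry script now pushes both the payoff table and the player ranking into TensorBoard as text. A minimal sketch of that logging call, assuming tb_logger is a standard torch.utils.tensorboard.SummaryWriter as in the other DI-engine demos; the strings below are placeholders, not real league output:

from torch.utils.tensorboard import SummaryWriter

tb_logger = SummaryWriter('./log/slime_volley_league_ppo_base')
envstep = 100000                                           # placeholder step counter
payoff_string = 'main_player vs historical_0: 0.63 ...'    # placeholder, normally repr(league.payoff)
rank_string = '1. main_player  trueskill=28.4 ...'         # placeholder, normally league.player_rank(string=True)
tb_logger.add_text('payoff_step', payoff_string, envstep)  # shows up under the TEXT tab in TensorBoard
tb_logger.add_text('rank_step', rank_string, envstep)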