
Commit 877cefc

1 parent b5bd5bf commit 877cefc

2 files changed (+12, -5 lines)


a3c.py (+2)
@@ -103,6 +103,8 @@ def act(self, state, reward, is_state_terminal):
                 else:
                     v_loss += (v - R) ** 2 / 2
 
+            pi_loss *= 0.5
+
             # Do we need to normalize losses by (self.t - self.t_start)?
             # Otherwise, loss scales can be different in case of self.t_max
             # and in case of termination.
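
The added line halves the accumulated policy loss before the backward pass, changing the relative weight of the policy and value terms. A rough standalone sketch of the effect, assuming the two losses are simply summed afterwards (that summation is not part of this diff, and the real values are chainer.Variable objects accumulated over the last t_max steps):

    # Toy numbers only, for illustration.
    pi_loss = 1.2
    v_loss = 0.8

    pi_loss *= 0.5                 # the change introduced in this commit
    total_loss = pi_loss + v_loss  # assumed combination, not shown in this diff
    print(total_loss)              # 1.4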

a3c_ale.py (+10, -5)
@@ -67,10 +67,11 @@ def main():
     parser.add_argument('--seed', type=int, default=None)
     parser.add_argument('--outdir', type=str, default=None)
     parser.add_argument('--use-sdl', action='store_true')
-    parser.add_argument('--t-max', type=int, default=5)
+    parser.add_argument('--t-max', type=int, default=20)
     parser.add_argument('--beta', type=float, default=1e-2)
     parser.add_argument('--profile', action='store_true')
     parser.add_argument('--steps', type=int, default=10 ** 7)
+    parser.add_argument('--lr', type=float, default=7e-4)
     parser.set_defaults(use_sdl=False)
     args = parser.parse_args()
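
The rollout length --t-max now defaults to 20 steps instead of 5, and a new --lr option exposes the starting learning rate (7e-4). A minimal sketch, using only the two options touched here, to confirm how the new defaults parse:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--t-max', type=int, default=20)
    parser.add_argument('--lr', type=float, default=7e-4)

    args = parser.parse_args([])   # empty argv: fall back to the defaults
    print(args.t_max, args.lr)     # 20 0.0007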

@@ -99,14 +100,15 @@ def agent_func(process_idx):
                 np.random.uniform(-3e-4, 3e-4, size=param.data.shape)
 
         # opt = optimizers.RMSprop(lr=1e-3)
-        opt = rmsprop_ones.RMSpropOnes(lr=1e-3, eps=1e-2, alpha=0.999)
+        opt = rmsprop_ones.RMSpropOnes(lr=7e-4, eps=1e-2, alpha=0.99)
         # opt = rmsprop_ones.RMSpropOnes(lr=1e-4, eps=1e-1)
         # opt = optimizers.RMSpropGraves(
         #     lr=2.5e-4, alpha=0.95, momentum=0.95, eps=1e-2)
         model = chainer.ChainList(pi, v)
         opt.setup(model)
-        opt.add_hook(chainer.optimizer.GradientClipping(2))
-        return a3c.A3C(model, opt, args.t_max, 0.99, beta=args.beta, process_idx=process_idx, phi=phi)
+        opt.add_hook(chainer.optimizer.GradientClipping(40))
+        return a3c.A3C(model, opt, args.t_max, 0.99, beta=args.beta,
+                       process_idx=process_idx, phi=phi)
 
     def env_func(process_idx):
         return ale.ALE(args.rom, use_sdl=args.use_sdl)
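
The shared optimizer now starts from a lower learning rate (7e-4 instead of 1e-3), uses a faster-decaying RMSprop average (alpha 0.99 instead of 0.999), and clips the global gradient norm at 40 instead of 2. A rough equivalent using the stock Chainer RMSprop as a stand-in for the custom rmsprop_ones.RMSpropOnes, with a toy two-link model in place of the real pi/v networks:

    import chainer
    import chainer.links as L
    from chainer import optimizers

    # Toy stand-in for chainer.ChainList(pi, v); the real networks live elsewhere.
    model = chainer.ChainList(L.Linear(4, 2), L.Linear(4, 1))

    # Same hyper-parameters as the new RMSpropOnes call above.
    opt = optimizers.RMSprop(lr=7e-4, alpha=0.99, eps=1e-2)
    opt.setup(model)
    # Clip the global gradient L2 norm at 40 before every update.
    opt.add_hook(chainer.optimizer.GradientClipping(40))
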
@@ -119,14 +121,17 @@ def run_func(process_idx, agent, env):
         try:
             for i in range(args.steps):
 
+                agent.optimizer.lr = (args.steps - i) / args.steps * args.lr
+
                 total_r += env.reward
                 episode_r += env.reward
 
                 action = agent.act(env.state, env.reward, env.is_terminal)
 
                 if env.is_terminal:
                     if process_idx == 0:
-                        print('{} i:{} episode_r:{}'.format(outdir, i, episode_r))
+                        print('{} i:{} lr:{} episode_r:{}'.format(
+                            outdir, i, agent.optimizer.lr, episode_r))
                     with open(os.path.join(outdir, 'scores.txt'), 'a+') as f:
                         print(i, episode_r, file=f)
                     if max_score == None or episode_r > max_score:
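
The new assignment anneals the learning rate linearly from args.lr down to (nearly) zero over the configured number of steps, and the per-episode log line now reports the current rate. A standalone sketch of the same schedule using the script's defaults (Python 3 division assumed):

    # Linear annealing: lr_t = (steps - t) / steps * lr0
    steps = 10 ** 7   # --steps default
    lr0 = 7e-4        # --lr default

    def annealed_lr(t):
        return (steps - t) / steps * lr0

    for t in (0, steps // 2, steps - 1):
        print(t, annealed_lr(t))
    # 0        0.0007
    # 5000000  0.00035
    # 9999999  ~7e-11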
