Dqn nonframe state #142

Draft: wants to merge 32 commits into base: master
Commits (32)
48f345d
Merge pull request #1 from astooke/master
crizCraig Feb 27, 2020
2b7b3e0
Get R2D1 training pong on one machine
crizCraig Feb 29, 2020
f7b86c5
Add time to log dir name
crizCraig Feb 29, 2020
d6b66d8
Playing around
crizCraig Mar 9, 2020
f28331c
Add deepdrive_r2d1_model.py - untested
crizCraig Mar 10, 2020
3faa399
Add example_8.py
kargarisaac Mar 21, 2020
ab3a9a5
Add deepdrive ddpg
kargarisaac Mar 22, 2020
9e3b153
Train ddpg and add evaluation code
kargarisaac Mar 23, 2020
2977925
Add deepdrive dqn
kargarisaac Mar 24, 2020
f2d706d
Edit deepdrive dqn evaluation function
kargarisaac Mar 25, 2020
4cd16ed
Add resume from checkpoint code
kargarisaac Mar 25, 2020
5c4db0f
Train dqn- not completed
kargarisaac Mar 28, 2020
da62bde
Solve merge conflict with upstream/master
kargarisaac Mar 28, 2020
8d648cf
Add baseline dqn code and train it
kargarisaac Mar 28, 2020
66e9185
Add more examples. r2d1 and dqn not working yet
kargarisaac Mar 31, 2020
dcce9aa
Add pong+dqn using custom agent and model
kargarisaac Mar 31, 2020
16a7115
add log for sampling eps in each itr
kargarisaac Apr 1, 2020
e4d3000
Solve the problem of replay buffer for dqn agents with non-frame states
kargarisaac Apr 2, 2020
83287b2
Add setting replay buffer class using dqn object creation
kargarisaac Apr 3, 2020
61f68d5
Solve evaluation for dqn
kargarisaac Apr 3, 2020
eb222d6
Add r2d1 training and eval code for dd0 env
kargarisaac Apr 4, 2020
92d6168
Add evaluation func for cartpole env
kargarisaac Apr 4, 2020
a035062
Remove example_9 and renamed example_10 to 9
kargarisaac Apr 4, 2020
62b002d
Edit replay buffer initialization setting
kargarisaac Apr 4, 2020
63e861f
Train dd0-r2d1 with opponent
kargarisaac Apr 4, 2020
f9b01fe
Removed some unused imports
kargarisaac Apr 4, 2020
1a4443d
Remove some wrapper functions
kargarisaac Apr 5, 2020
2250ba4
Clean dd0-dqn and dd0-r2d1 models
kargarisaac Apr 5, 2020
65d415b
Edit r2d1-dd0 evaluate func
kargarisaac Apr 6, 2020
a51fea5
r2d1 training ...
kargarisaac Apr 7, 2020
59d3bad
dellete some files
kargarisaac Apr 7, 2020
8331446
Edit example_9
kargarisaac Apr 9, 2020
9 changes: 8 additions & 1 deletion examples/atari_dqn_async_cpu.py
@@ -7,11 +7,16 @@
from rlpyt.utils.launching.affinity import make_affinity
from rlpyt.samplers.async_.cpu_sampler import AsyncCpuSampler
from rlpyt.envs.atari.atari_env import AtariEnv, AtariTrajInfo
from rlpyt.envs.gym import make as gym_make
from rlpyt.algos.dqn.dqn import DQN
from rlpyt.agents.dqn.atari.atari_dqn_agent import AtariDqnAgent
from rlpyt.runners.async_rl import AsyncRlEval
from rlpyt.utils.logging.context import logger_context

import sys
sys.path.append('/home/isaac/codes/dd-zero/rlpyt/examples')  # machine-specific path; adjust so example_9 is importable
from examples.example_9 import ResizeFrame, make_env_custom, CustomDqnAgent


def build_and_train(game="pong", run_ID=0):
# Change these inputs to match local machine and desired parallelism.
@@ -44,7 +49,9 @@ def build_and_train(game="pong", run_ID=0):
min_steps_learn=1e4,
replay_size=int(1e5)
)
agent = AtariDqnAgent()
# agent = AtariDqnAgent()
agent = CustomDqnAgent()
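# Note (added for clarity, not in the PR): CustomDqnAgent comes from
# examples/example_9.py (imported above) and wraps the MLP-based CustomDqnModel,
# replacing the convolutional AtariDqnAgent used before this change.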

runner = AsyncRlEval(
algo=algo,
agent=agent,
2 changes: 2 additions & 0 deletions examples/example_2.py
@@ -59,3 +59,5 @@ def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None):
run_ID=args.run_ID,
cuda_idx=args.cuda_idx,
)


64 changes: 64 additions & 0 deletions examples/example_8.py
@@ -0,0 +1,64 @@

"""
Runs one instance of the environment and optimizes using the DDPG algorithm
(SAC is imported as well and can be swapped in). Can use a GPU for the agent
(applies to both sample and train). No parallelism employed; everything happens
in one python process, which can make debugging easier.

Requires OpenAI gym (and Box2D for the LunarLander/BipedalWalker environments).
If not installed, move on to the next example.

"""

from rlpyt.samplers.serial.sampler import SerialSampler
from rlpyt.envs.gym import make as gym_make
from rlpyt.algos.qpg.sac import SAC
from rlpyt.algos.qpg.ddpg import DDPG
from rlpyt.agents.qpg.sac_agent import SacAgent
from rlpyt.agents.qpg.ddpg_agent import DdpgAgent
from rlpyt.runners.minibatch_rl import MinibatchRlEval
from rlpyt.utils.logging.context import logger_context


def build_and_train(env_id="LunarLanderContinuous-v2", run_ID=0, cuda_idx=None):
sampler = SerialSampler(
EnvCls=gym_make,
env_kwargs=dict(id=env_id),
eval_env_kwargs=dict(id=env_id),
batch_T=1, # One time-step per sampler iteration.
batch_B=1, # One environment (i.e. sampler Batch dimension).
max_decorrelation_steps=0,
eval_n_envs=10,
eval_max_steps=int(51e3),
eval_max_trajectories=50,
)
algo = DDPG() #SAC() # Run with defaults.
agent = DdpgAgent() #SacAgent()
runner = MinibatchRlEval(
algo=algo,
agent=agent,
sampler=sampler,
n_steps=1e6,
log_interval_steps=1e4,
affinity=dict(cuda_idx=cuda_idx),
)
config = dict(env_id=env_id)
name = "ddpg_" + env_id
log_dir = "example_8"
with logger_context(log_dir, run_ID, name, config):
runner.train()


if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# This setup requires an env with a Box observation space and a continuous (Box) action space; DDPG/SAC do not work with Discrete actions. deepdrive-zero should be fine.
parser.add_argument('--env_id', help='environment ID', default='LunarLanderContinuous-v2') # 'BipedalWalkerHardcore-v3':ok, 'LunarLanderContinuous-v2':ok
parser.add_argument('--run_ID', help='run identifier (logging)', type=int, default=0)
parser.add_argument('--cuda_idx', help='gpu to use ', type=int, default=None)
args = parser.parse_args()
build_and_train(
env_id=args.env_id,
run_ID=args.run_ID,
cuda_idx=args.cuda_idx,
)
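For reference, a minimal way to launch this example directly from Python, assuming the repository root is on PYTHONPATH (the call below is illustrative and not part of the PR):

from examples.example_8 import build_and_train

# Train DDPG on LunarLanderContinuous-v2 on CPU with the defaults defined above.
build_and_train(env_id="LunarLanderContinuous-v2", run_ID=0, cuda_idx=None)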
195 changes: 195 additions & 0 deletions examples/example_9.py
@@ -0,0 +1,195 @@

from rlpyt.utils.launching.affinity import make_affinity
from rlpyt.samplers.parallel.cpu.sampler import CpuSampler
from rlpyt.samplers.parallel.gpu.sampler import GpuSampler
from rlpyt.samplers.serial.sampler import SerialSampler
from rlpyt.samplers.async_.cpu_sampler import AsyncCpuSampler
from rlpyt.algos.dqn.dqn import DQN
from rlpyt.agents.dqn.dqn_agent import DqnAgent
from rlpyt.agents.dqn.atari.atari_dqn_agent import AtariDqnAgent
from rlpyt.runners.minibatch_rl import MinibatchRlEval, MinibatchRl
from rlpyt.utils.logging.context import logger_context
from rlpyt.envs.gym import GymEnvWrapper
from rlpyt.runners.async_rl import AsyncRlEval
from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims
from rlpyt.utils.wrappers import *
from rlpyt.envs.gym import make as make_env
from rlpyt.replays.non_sequence.uniform import UniformReplayBuffer
from rlpyt.envs.base import EnvSpaces
from rlpyt.utils.buffer import buffer_to

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import gym



############################# classes and functions #############################

class CustomMixin:
def make_env_to_model_kwargs(self, env_spaces):
return dict(observation_shape=env_spaces.observation.shape,
output_size=env_spaces.action.n)


class CustomDqnModel(torch.nn.Module):
def __init__(
self,
observation_shape,
output_size,
fc_sizes=64
):
super().__init__()
self._obs_ndim = len(observation_shape)
input_shape = observation_shape[0]

self.base_net = torch.nn.Sequential(
torch.nn.Linear(input_shape, fc_sizes),
torch.nn.ReLU(),
torch.nn.Linear(fc_sizes, fc_sizes),
torch.nn.ReLU(),
torch.nn.Linear(fc_sizes, output_size),
)
# self.base_net.apply(self.init_weights)

def forward(self, observation, prev_action, prev_reward):
observation = observation.type(torch.float)
lead_dim, T, B, obs_shape = infer_leading_dims(observation, self._obs_ndim)
obs = observation.view(T * B, -1)
q = self.base_net(obs)
q = restore_leading_dims(q, lead_dim, T, B)
return q

def init_weights(self, m):
if type(m) == torch.nn.Linear:
torch.nn.init.normal_(m.weight)
torch.nn.init.zeros_(m.bias)
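# Shape walk-through (illustrative note, not part of the PR): for CartPole-v0 the
# model gets observation_shape=(4,) and output_size=2, so an input of shape
# (T, B, 4) is flattened to (T*B, 4) for the MLP and the returned Q-values are
# restored to shape (T, B, 2) by restore_leading_dims.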


class CustomDqnAgent(CustomMixin, DqnAgent):
def __init__(self, ModelCls=CustomDqnModel, **kwargs):
super().__init__(ModelCls=ModelCls, **kwargs)

@torch.no_grad()
def eval_step(self, observation, prev_action, prev_reward):
"""Computes Q-values for states/observations and selects actions by
epsilon-greedy. (no grad)"""
# prev_action = self.distribution.to_onehot(prev_action)
model_inputs = buffer_to((observation, prev_action, prev_reward),
device=self.device)
q = self.model(*model_inputs)
q = q.cpu()
action = torch.argmax(q)
return action


def make_env_custom(*args, **kwargs):
env = gym.make('CartPole-v0')
env = GymEnvWrapper(env)
return env


def build_and_train(run_ID=0, cuda_idx=None):
env_id = 'CartPole-v0'

sampler = CpuSampler(
EnvCls=make_env,
env_kwargs=dict(id=env_id), #env_config,
eval_env_kwargs=dict(id=env_id), #env_config,
batch_T=4, # Time-steps per sampler iteration.
batch_B=8, # Number of parallel environments (i.e. sampler batch dimension).
max_decorrelation_steps=100,
eval_n_envs=2,
eval_max_steps=int(10e3),
eval_max_trajectories=4,
)

algo = DQN(
learning_rate=1e-3,
replay_ratio=8,
batch_size=32,
min_steps_learn=32,
eps_steps=10e3,
replay_size=int(1e3),
# double_dqn=True,
# target_update_interval=1,
# prioritized_replay=True,
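# Note (added for clarity, not in the PR text): frame_state_space is a new DQN
# argument introduced by this branch; False appears to select a non-frame replay
# buffer (e.g. UniformReplayBuffer) for vector observations instead of frame
# stacking; inferred from the commit messages, the algo-side change is not in this diff.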
frame_state_space=False,
)

agent = CustomDqnAgent()

runner = MinibatchRl(
algo=algo,
agent=agent,
sampler=sampler,
n_steps=1e6,
log_interval_steps=1e2,
affinity=dict(cuda_idx=cuda_idx, workers_cpus=[0, 1, 2, 4, 5, 6])
)

config = dict(env_id=env_id)
algo_name = 'dqn_'
name = algo_name + env_id
log_dir = algo_name + env_id

with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
runner.train()


def evaluate():
import time
pre_trained_model = '/home/isaac/codes/dd-zero/rlpyt/data/local/2020_04-04_06-52.20/dqn_CartPole-v0/run_0/itr_24713.pkl'
data = torch.load(pre_trained_model)
agent_state_dict = data['agent_state_dict']

# for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69
env = gym.make('CartPole-v0')

agent = CustomDqnAgent(initial_model_state_dict=agent_state_dict['model'])

env_spaces = EnvSpaces(
observation=env.observation_space,
action=env.action_space,
)
agent.initialize(env_spaces)
agent.load_state_dict(agent_state_dict['model'])

obs = env.reset()
tot_reward = 0
while True:
# action = agent.step(torch.tensor(obs, dtype=torch.float32), torch.tensor(0), torch.tensor(0))
action = agent.eval_step(torch.tensor(obs, dtype=torch.float32), None, None)
a = np.array(action)
obs, reward, done, info = env.step(a)
tot_reward += reward
env.render()
time.sleep(0.001)
if done:
break

print('reward: ', tot_reward)
env.close()


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--run_ID', help='run identifier (logging)', type=int, default=0)
parser.add_argument('--cuda_idx', help='gpu to use ', type=int, default=0)
parser.add_argument('--mode', help='train or eval', default='eval')

args = parser.parse_args()

if args.mode == 'train':
build_and_train(
run_ID=args.run_ID,
cuda_idx=args.cuda_idx,
)
else:
evaluate()
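A typical workflow for this example, sketched for clarity (illustrative, not part of the PR; the checkpoint path inside evaluate() is machine-specific and must point to an existing snapshot):

from examples.example_9 import build_and_train, evaluate

# 1) Train DQN on CartPole-v0; logger_context with snapshot_mode='last' writes itr_*.pkl files.
build_and_train(run_ID=0, cuda_idx=None)
# 2) Load the snapshot hard-coded in evaluate() and render greedy rollouts.
evaluate()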
10 changes: 10 additions & 0 deletions rlpyt/agents/base.py
@@ -9,6 +9,9 @@
from rlpyt.utils.logging import logger
from rlpyt.models.utils import strip_ddp_state_dict

from rlpyt.spaces.int_box import IntBox
from rlpyt.spaces.float_box import FloatBox

AgentInputs = namedarraytuple("AgentInputs",
["observation", "prev_action", "prev_reward"])
AgentStep = namedarraytuple("AgentStep", ["action", "agent_info"])
@@ -56,6 +59,9 @@ def __init__(self, ModelCls=None, model_kwargs=None, initial_model_state_dict=No
self._send_count = mp.RawValue("l", 0)
self._recv_count = 0

# TODO: add frame idx counter for e-greedy action selection
self.frame_idx = 0
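# One possible (untested) use of this counter, noted for clarity: increment it
# once per sampled step and anneal epsilon linearly, e.g.
# eps = max(eps_final, eps_init - (frame_idx / eps_steps) * (eps_init - eps_final)).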

def __call__(self, observation, prev_action, prev_reward):
"""Returns values from model forward pass on training data (i.e. used
in algorithm)."""
@@ -79,6 +85,10 @@ def initialize(self, env_spaces, share_memory=False, **kwargs):
env_spaces: passed to ``make_env_to_model_kwargs()``, typically namedtuple of 'observation' and 'action'.
share_memory (bool): whether to use shared memory for model parameters.
"""

# FloatBox(env_spaces.observation.low, env_spaces.observation.high)

self.env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
self.model = self.ModelCls(**self.env_model_kwargs,
**self.model_kwargs)
1 change: 0 additions & 1 deletion rlpyt/agents/dqn/atari/atari_dqn_agent.py
@@ -3,7 +3,6 @@
from rlpyt.models.dqn.atari_dqn_model import AtariDqnModel
from rlpyt.agents.dqn.atari.mixin import AtariMixin


class AtariDqnAgent(AtariMixin, DqnAgent):

def __init__(self, ModelCls=AtariDqnModel, **kwargs):
5 changes: 5 additions & 0 deletions rlpyt/agents/dqn/dqn_agent.py
@@ -3,6 +3,7 @@
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.parallel import DistributedDataParallelCPU as DDPC


from rlpyt.agents.base import BaseAgent, AgentStep
from rlpyt.agents.dqn.epsilon_greedy import EpsilonGreedyAgentMixin
from rlpyt.distributions.epsilon_greedy import EpsilonGreedy
@@ -42,6 +43,8 @@ def initialize(self, env_spaces, share_memory=False,
if env_ranks is not None:
self.make_vec_eps(global_B, env_ranks)

self.env_spaces = env_spaces

def to_device(self, cuda_idx=None):
super().to_device(cuda_idx)
self.target_model.to(self.device)
@@ -60,6 +63,7 @@ def step(self, observation, prev_action, prev_reward):
q = self.model(*model_inputs)
q = q.cpu()
action = self.distribution.sample(q)

agent_info = AgentInfo(q=q)
# action, agent_info = buffer_to((action, agent_info), device="cpu")
return AgentStep(action=action, agent_info=agent_info)
@@ -75,3 +79,4 @@ def target(self, observation, prev_action, prev_reward):
def update_target(self, tau=1):
"""Copies the model parameters into the target model."""
update_state_dict(self.target_model, self.model.state_dict(), tau)

2 changes: 2 additions & 0 deletions rlpyt/agents/dqn/epsilon_greedy.py
@@ -38,6 +38,8 @@ def __init__(
self._eps_itr_min_max[0] = eps_itr_min
self._eps_itr_min_max[1] = eps_itr_max

# TODO: counter for time-steps

def collector_initialize(self, global_B=1, env_ranks=None):
"""For vector-valued epsilon, the agent inside the sampler worker process
must initialize with its own epsilon values."""