From 63c7972c20ca2073d37166cce2463a5c026790bd Mon Sep 17 00:00:00 2001
From: Nemo Fournier
Date: Fri, 31 May 2019 14:15:37 +0200
Subject: [PATCH] Initial commit

---
 .gitignore       |   2 +
 actor.py         | 101 +++++++++++++++++++++++++++++++++++++
 agent.py         |  80 ++++++++++++++++++++++++++++++
 critic.py        | 126 +++++++++++++++++++++++++++++++++++++++++++++++
 environment.py   |  24 +++++++++
 main.py          |  94 +++++++++++++++++++++++++++++++++++
 replay_buffer.py |  54 ++++++++++++++++++++
 7 files changed, 481 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 actor.py
 create mode 100644 agent.py
 create mode 100644 critic.py
 create mode 100644 environment.py
 create mode 100644 main.py
 create mode 100644 replay_buffer.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bc6026d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/*
+logs
diff --git a/actor.py b/actor.py
new file mode 100644
index 0000000..17e9a77
--- /dev/null
+++ b/actor.py
@@ -0,0 +1,101 @@
+import tensorflow as tf
+import tflearn
+
+UNITS = 128
+MAX_STEPS = 100
+
+class Actor:
+    def __init__(self, session, dim_state, dim_goal, dim_action, env, tau, learning_rate, batch_size):
+        self._sess = session
+
+        self._dim_state = dim_state
+        self._dim_action = dim_action
+        self._dim_goal = dim_goal
+        self._action_bound = env.action_space.high
+        self._internal_memory = []
+        self._tau = tau
+        self._learning_rate = learning_rate
+        self._batch_size = batch_size
+
+        self._net_inputs, self._net_out, self._net_scaled_out = self.create_network()
+        self._net_input_state, self._net_input_goal, self._net_input_history = self._net_inputs
+
+        self._network_params = tf.trainable_variables()
+
+        self._target_inputs, self._target_out, self._target_scaled_out = self.create_network()
+        self._target_input_state, self._target_input_goal, self._target_input_history = self._target_inputs
+
+        self._target_network_params = tf.trainable_variables()[len(self._network_params):]
+
+        # Op for periodically updating target network with online network
+        # weights
+        self._update_target_network_params = [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) + tf.multiply(self._target_network_params[i], 1. - self._tau)) for i in range(len(self._target_network_params))]
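+        # i.e. theta_target <- tau * theta_online + (1 - tau) * theta_target after every
+        # update, so the target network slowly tracks the online one (TAU = 1e-3 in agent.py)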
+
+        # This gradient will be provided by the critic network
+        self._action_gradient = tf.placeholder(tf.float32, [None, self._dim_action])
+
+        # Combine the gradients here
+        self._unnormalized_actor_gradients = tf.gradients(self._net_scaled_out, self._network_params, -self._action_gradient)
+        self._actor_gradients = list(map(lambda x: tf.div(x, self._batch_size), self._unnormalized_actor_gradients))
+
+        # Optimization Op
+        self._optimize = tf.train.AdamOptimizer(self._learning_rate).apply_gradients(zip(self._actor_gradients, self._network_params))
+
+        self._num_trainable_vars = len(self._network_params) + len(self._target_network_params)
+
+    def create_network(self):
+        input_state = tflearn.input_data(shape=[None, self._dim_state], name='input_state')
+        input_goal = tflearn.input_data(shape=[None, self._dim_goal], name='input_goal')
+
+        input_memory = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_state + self._dim_action])
+
+        input_ff = tflearn.merge([input_goal, input_state], 'concat')
+
+        ff_branch = tflearn.fully_connected(input_ff, UNITS)
+        ff_branch = tflearn.activations.relu(ff_branch)
+
+        # recurrent_branch = tflearn.fully_connected(input_memory, UNITS)
+        # recurrent_branch = tflearn.activations.relu(recurrent_branch)
+        recurrent_branch = tflearn.lstm(input_memory, UNITS, dynamic=True)
+
+        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
+        out = tflearn.fully_connected(
+            merged_branch, self._dim_action, activation='tanh', weights_init=weights_init)
+        # Scale output to -action_bound to action_bound
+        scaled_out = tf.multiply(out, self._action_bound)
+        return [input_state, input_goal, input_memory], out, scaled_out
+
+    def train(self, input_state, input_goal, input_history, a_gradient):
+        self._sess.run(self._optimize, feed_dict={
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_history: input_history,
+            self._action_gradient: a_gradient
+        })
+
+    def predict(self, input_state, input_goal, input_history):
+        return self._sess.run(self._net_scaled_out, feed_dict={
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_history: input_history,
+        })
+
+    def predict_target(self, input_state, input_goal, input_history):
+        return self._sess.run(self._target_scaled_out, feed_dict={
+            self._target_input_state: input_state,
+            self._target_input_goal: input_goal,
+            self._target_input_history: input_history,
+        })
+
+    def update_target_network(self):
+        self._sess.run(self._update_target_network_params)
+
+    def get_num_trainable_vars(self):
+        return self._num_trainable_vars
diff --git a/agent.py b/agent.py
new file mode 100644
index 0000000..b432189
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,80 @@
+import gym
+import tensorflow as tf
+import numpy as np
+
+from actor import Actor
+from critic import Critic
+
+MAX_STEPS = 100
+TAU = 1e-3
+LEARNING_RATE = 5e-4
+
+class Agent:
+    def __init__(self, experiment, batch_size):
+        self._dummy_env = gym.make(experiment)
+        self._sess = tf.Session()
+
+        # Hardcoded for now
+        self._dim_state = 10
+        self._dim_goal = 3
+        self._dim_action = self._dummy_env.action_space.shape[0]
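+        # (the hardcoded 10 and 3 above are the sizes of FetchReach-v1's
+        # 'observation' and 'desired_goal' entries)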
+        self._dim_env = 1
+        self._batch_size = batch_size
+
+        self._actor = Actor(self._sess,
+            self._dim_state, self._dim_goal, self._dim_action, self._dummy_env, TAU, LEARNING_RATE, self._batch_size)
+
+        self._critic = Critic(self._sess,
+            self._dim_state, self._dim_goal, self._dim_action, self._dim_env, self._dummy_env, TAU, LEARNING_RATE, self._actor.get_num_trainable_vars())
+
+        self._sess.run(tf.global_variables_initializer())
+
+        self._actor.update_target_network()
+        self._critic.update_target_network()
+
+        #loss_summary = tf.summary.scalar('loss', self._critic._loss)
+        #writer = tf.summary.FileWriter('logs/')
+        #writer.add_summary(
+        #writer.add_graph(tf.get_default_graph())
+        #writer.flush()
+
+    def evaluate_actor(self, actor_predict, obs, goal, history):
+
+        assert (history.shape[0] == MAX_STEPS), "history must be of size MAX_STEPS"
+
+        obs = obs.reshape(1, self._dim_state)
+        goal = goal.reshape(1, self._dim_goal)
+        history = history.reshape(1, history.shape[0], history.shape[1])
+
+        return actor_predict(obs, goal, history)
+
+    def evaluate_actor_batch(self, actor_predict, obs, goal, history):
+
+        return actor_predict(obs, goal, history)
+
+    def evaluate_critic(self, critic_predict, obs, action, goal, history, env):
+        obs = obs.reshape(1, self._dim_state)
+        goal = goal.reshape(1, self._dim_goal)
+        action = action.reshape(1, self._dim_action)
+        history = history.reshape(1, history.shape[0], history.shape[1])
+        env = env.reshape(1, self._dim_env)
+
+        return critic_predict(env, obs, goal, action, history)
+
+    def evaluate_critic_batch(self, critic_predict, obs, action, goal, history, env):
+        return critic_predict(env, obs, goal, action, history)
+
+    def train_critic(self, obs, action, goal, history, env, predicted_q_value):
+        return self._critic.train(env, obs, goal, action, history, predicted_q_value)
+
+    def train_actor(self, obs, goal, history, a_gradient):
+        return self._actor.train(obs, goal, history, a_gradient)
+
+    def action_gradients_critic(self, obs, action, goal, history, env):
+        return self._critic.action_gradients(env, obs, goal, action, history)
+
+    def update_target_actor(self):
+        self._actor.update_target_network()
+
+    def update_target_critic(self):
+        self._critic.update_target_network()
diff --git a/critic.py b/critic.py
new file mode 100644
index 0000000..aa29144
--- /dev/null
+++ b/critic.py
@@ -0,0 +1,126 @@
+import tensorflow as tf
+import tflearn
+
+UNITS = 128
+MAX_STEPS = 100
+
+class Critic:
+    def __init__(self, session, dim_state, dim_goal, dim_action, dim_env, env, tau, learning_rate, num_actor_vars):
+        self._sess = session
+
+        self._dim_state = dim_state
+        self._dim_action = dim_action
+        self._dim_env = dim_env
+        self._dim_goal = dim_goal
+        self._action_bound = env.action_space.high
+
+        self._learning_rate = learning_rate
+        self._tau = tau
+
+
+        self._net_inputs, self._net_out = self.create_network()
+
+        self._net_input_env, self._net_input_goal, self._net_input_action, self._net_input_state, self._net_input_history = self._net_inputs
+
+        self._network_params = tf.trainable_variables()[num_actor_vars:]
+
+        self._target_inputs, self._target_out = self.create_network()
+
+        self._target_input_env, self._target_input_goal, self._target_input_action, self._target_input_state, self._target_input_history = self._target_inputs
+
+        self._target_network_params = tf.trainable_variables()[(len(self._network_params) + num_actor_vars):]
+
+        # Op for periodically updating target network with online network
+        # weights with regularization
+        self._update_target_network_params = \
+            [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) \
+            + tf.multiply(self._target_network_params[i], 1. - self._tau))
+                for i in range(len(self._target_network_params))]
+
+        # Network target (y_i)
+        self._predicted_q_value = tf.placeholder(tf.float32, [None, 1])
+
+        # Define loss and optimization Op
+        self._loss = tflearn.mean_square(self._predicted_q_value, self._net_out)
+        self._optimize = tf.train.AdamOptimizer(
+            self._learning_rate).minimize(self._loss)
+
+        # Get the gradient of the net w.r.t. the action.
+        # For each action in the minibatch (i.e., for each x in xs),
+        # this will sum up the gradients of each critic output in the minibatch
+        # w.r.t. that action. Each output is independent of all
+        # actions except for one.
+        self._action_grads = tf.gradients(self._net_out, self._net_input_action)
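+        # dQ/da is exactly what the actor needs: the deterministic policy gradient is
+        # E[ grad_a Q(s, a)|a=mu(s) * grad_theta mu(s) ], and this tensor is fed back
+        # into Actor.train() as a_gradient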
+
+    def create_network(self):
+        input_state = tflearn.input_data(shape=[None, self._dim_state])
+        input_goal = tflearn.input_data(shape=[None, self._dim_goal])
+        input_action = tflearn.input_data(shape=[None, self._dim_action])
+        input_env = tflearn.input_data(shape=[None, self._dim_env])
+
+        input_history = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_action + self._dim_state])
+
+        input_ff = tflearn.merge(
+            [input_env, input_goal, input_action, input_state], 'concat')
+
+        ff_branch = tflearn.fully_connected(input_ff, UNITS)
+        ff_branch = tflearn.activations.relu(ff_branch)
+
+        #recurrent_branch = tflearn.fully_connected(inputs, UNITS)
+        #recurrent_branch = tflearn.activations.relu(recurrent_branch)
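+        # the LSTM branch encodes the rolling (action, state) history of the episode as
+        # an extra feature vector next to the feed-forward branch (presumably so the
+        # networks can infer unobserved parameters of a randomized environment)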
+        recurrent_branch = tflearn.lstm(input_history, UNITS, dynamic=True)
+
+        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
+        out = tflearn.fully_connected(
+            merged_branch, 1, activation='linear', weights_init=weights_init)
+
+        return [input_env, input_goal, input_action, input_state, input_history], out
+
+
+    def train(self, input_env, input_state, input_goal, input_action, input_history, predicted_q_value):
+        return self._sess.run([self._net_out, self._optimize], feed_dict={
+            self._net_input_env: input_env,
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_action: input_action,
+            self._net_input_history: input_history,
+
+            self._predicted_q_value: predicted_q_value
+        })
+
+    def predict(self, input_env, input_state, input_goal, input_action, input_history):
+        return self._sess.run(self._net_out, feed_dict={
+            self._net_input_env: input_env,
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_action: input_action,
+            self._net_input_history: input_history,
+        })
+
+    def predict_target(self, input_env, input_state, input_goal, input_action, input_history):
+        return self._sess.run(self._target_out, feed_dict={
+            self._target_input_env: input_env,
+            self._target_input_state: input_state,
+            self._target_input_goal: input_goal,
+            self._target_input_action: input_action,
+            self._target_input_history: input_history,
+        })
+
+    def action_gradients(self, input_env, input_state, input_goal, input_action, input_history):
+        return self._sess.run(self._action_grads, feed_dict={
+            self._net_input_env: input_env,
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_action: input_action,
+            self._net_input_history: input_history
+        })
+
+    def update_target_network(self):
+        self._sess.run(self._update_target_network_params)
diff --git a/environment.py b/environment.py
new file mode 100644
index 0000000..e958e21
--- /dev/null
+++ b/environment.py
@@ -0,0 +1,24 @@
+import gym
+
+class RandomizedEnvironment:
+    """ Randomized environment class """
+    def __init__(self, experiment, parameter_ranges, goal_range):
+        self._experiment = experiment
+        self._parameter_ranges = parameter_ranges
+        self._goal_range = goal_range
+        self._params = [0]
+
+    def sample_env(self):
+        # environment randomization is not implemented yet: the parameter vector
+        # stays a single placeholder value (hence dim_env = 1 in Agent)
+        self._params = [0]
+        self._env = gym.make(self._experiment)
+        self._env.env.reward_type = "dense"
+
+    def get_env(self):
+        """
+        Returns a randomized environment and the vector of the parameter
+        space that corresponds to this very instance
+        """
+        return self._env, self._params
+
+    def get_goal(self):
+        return
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..6c0cbe2
--- /dev/null
+++ b/main.py
@@ -0,0 +1,94 @@
+import numpy as np
+import gym
+
+from environment import RandomizedEnvironment
+from agent import Agent
+from replay_buffer import ReplayBuffer
+
+EPISODES = 1000
+
+experiment = "FetchReach-v1"
+env = gym.make(experiment)
+
+# Initialize networks
+BATCH_SIZE = 128
+BUFFER_SIZE = 100000
+MAX_STEPS = 100
+GAMMA = 0.99
+
+agent = Agent(experiment, BATCH_SIZE)
+randomized_environment = RandomizedEnvironment(experiment, [], [])
+
+replay_buffer = ReplayBuffer(BUFFER_SIZE)
+
+dim_history_atom = agent._dim_state + agent._dim_action
+
+randomized_environment.sample_env()
+env, env_params = randomized_environment.get_env()
+
+success = 0
+
+for episode in range(EPISODES):
+    history = np.array(MAX_STEPS*[np.zeros(dim_history_atom)])
+    # generate a rollout
+
+    obs_dict = env.reset()
+    last_action = env.action_space.sample()  # fake last_action, to feed the network
+
+    obs = obs_dict['observation']
+    history = np.append(history, [np.concatenate((last_action, obs))], axis=0)[1:]
+
+    done = False
+
+    print("Episode : {}".format(episode))
+    tot_rew = 0
+
+    while not done:
+        obs = obs_dict['observation']
+        goal = obs_dict['desired_goal']
+        action = agent.evaluate_actor(agent._actor.predict, obs, goal, history)
+
+        if episode > 600:
+            env.render()
+
+        new_obs_dict, step_reward, done, info = env.step(action[0])
+        tot_rew += step_reward
+        new_obs = new_obs_dict['observation']
+
+        history = np.append(history, [np.concatenate((action[0], obs))], axis=0)[1:]
+
+        replay_buffer.add(obs, action, step_reward, done, new_obs, history, env_params, goal)
+
+        # advance to the next observation; obs_dict was previously never updated,
+        # so every step reused the initial observation and goal
+        obs = new_obs
+        obs_dict = new_obs_dict
+
+        if done and info['is_success'] > 0.01:
+            success += 1
+            # episode is 0-indexed, so use episode + 1 to avoid dividing by zero
+            print("Success ! {}/{} ({})".format(success, episode + 1, success/(episode + 1)))
+
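+        # one DDPG update per episode, run on the final step once the buffer holds a
+        # full batch: sample a minibatch, build the Bellman targets
+        # y = r + GAMMA * Q'(s2, mu'(s2)), fit the critic, then update the actor with
+        # the critic's action gradient and soft-update both target networks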
+        if replay_buffer.size() > BATCH_SIZE and done:
+            s_batch, a_batch, r_batch, t_batch, s2_batch, history_batch, env_batch, goal_batch = replay_buffer.sample_batch(BATCH_SIZE)
+
+            target_action_batch = agent.evaluate_actor_batch(agent._actor.predict_target, s2_batch, goal_batch, history_batch)
+
+            # use the *target* actor's actions for the Bellman backup (target_action_batch
+            # was previously computed but left unused in favour of the online actor)
+            target_q = agent.evaluate_critic_batch(agent._critic.predict_target, s2_batch, target_action_batch, goal_batch, history_batch, env_batch)
+
+            y_i = []
+            for k in range(BATCH_SIZE):
+                if t_batch[k]:
+                    y_i.append(r_batch[k])
+                else:
+                    y_i.append(r_batch[k] + GAMMA * target_q[k])
+
+            predicted_q_value, _ = agent.train_critic(s_batch, a_batch, goal_batch, history_batch, env_batch, np.reshape(y_i, (BATCH_SIZE, 1)))
+
+            # Update the actor policy using the sampled gradient
+            a_outs = agent.evaluate_actor_batch(agent._actor.predict, s_batch, goal_batch, history_batch)
+            grads = agent.action_gradients_critic(s_batch, a_outs, goal_batch, history_batch, env_batch)
+            agent.train_actor(s_batch, goal_batch, history_batch, grads[0])
+
+            # Update target networks
+            agent.update_target_actor()
+            agent.update_target_critic()
+    print("Reward : {}".format(tot_rew))
diff --git a/replay_buffer.py b/replay_buffer.py
new file mode 100644
index 0000000..1c8de0c
--- /dev/null
+++ b/replay_buffer.py
@@ -0,0 +1,54 @@
+"""
+Data structure for implementing experience replay
+
+Author: Patrick Emami
+"""
+from collections import deque
+import random
+import numpy as np
+
+class ReplayBuffer(object):
+
+    def __init__(self, buffer_size, random_seed=123):
+        """
+        The right side of the deque contains the most recent experiences
+        """
+        self.buffer_size = buffer_size
+        self.count = 0
+        self.buffer = deque()
+        random.seed(random_seed)
+
+    def add(self, s, a, r, t, s2, history, env, goal):
+        experience = (s, a, r, t, s2, history, env, goal)
+        if self.count < self.buffer_size:
+            self.buffer.append(experience)
+            self.count += 1
+        else:
+            self.buffer.popleft()
+            self.buffer.append(experience)
+
+    def size(self):
+        return self.count
+
+    def sample_batch(self, batch_size):
+        batch = []
+
+        if self.count < batch_size:
+            batch = random.sample(self.buffer, self.count)
+        else:
+            batch = random.sample(self.buffer, batch_size)
+
+        s_batch = np.array([_[0] for _ in batch])
+        # actions are stored with shape (1, 4); flatten them (4 = action dim of FetchReach-v1)
+        a_batch = np.array([_[1].reshape(4) for _ in batch])
+        r_batch = np.array([_[2] for _ in batch])
+        t_batch = np.array([_[3] for _ in batch])
+        s2_batch = np.array([_[4] for _ in batch])
+        history_batch = np.array([_[5] for _ in batch])
+        env_batch = np.array([_[6] for _ in batch])
+        goal_batch = np.array([_[7] for _ in batch])
+
+        return s_batch, a_batch, r_batch, t_batch, s2_batch, history_batch, env_batch, goal_batch
+
+    def clear(self):
+        self.buffer.clear()
+        self.count = 0