From 63c7972c20ca2073d37166cce2463a5c026790bd Mon Sep 17 00:00:00 2001
From: Nemo Fournier
Date: Fri, 31 May 2019 14:15:37 +0200
Subject: [PATCH] Initial commit

---
 .gitignore       |   2 +
 actor.py         | 101 +++++++++++++++++++++++++++++++++++++
 agent.py         |  80 ++++++++++++++++++++++++++++++
 critic.py        | 126 +++++++++++++++++++++++++++++++++++++++++++++++
 environment.py   |  24 +++++++++
 main.py          |  94 +++++++++++++++++++++++++++++++++++
 replay_buffer.py |  54 ++++++++++++++++++++
 7 files changed, 481 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 actor.py
 create mode 100644 agent.py
 create mode 100644 critic.py
 create mode 100644 environment.py
 create mode 100644 main.py
 create mode 100644 replay_buffer.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bc6026d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/*
+logs
diff --git a/actor.py b/actor.py
new file mode 100644
index 0000000..17e9a77
--- /dev/null
+++ b/actor.py
@@ -0,0 +1,101 @@
+import tensorflow as tf
+import tflearn
+
+UNITS = 128
+MAX_STEPS = 100
+
+class Actor:
+    def __init__(self, session, dim_state, dim_goal, dim_action, env, tau, learning_rate, batch_size):
+        self._sess = session
+
+        self._dim_state = dim_state
+        self._dim_action = dim_action
+        self._dim_goal = dim_goal
+        self._action_bound = env.action_space.high
+        self._internal_memory = []
+        self._tau = tau
+        self._learning_rate = learning_rate
+        self._batch_size = batch_size
+
+        self._net_inputs, self._net_out, self._net_scaled_out = self.create_network()
+        self._net_input_state, self._net_input_goal, self._net_input_history = self._net_inputs
+
+        self._network_params = tf.trainable_variables()
+
+        self._target_inputs, self._target_out, self._target_scaled_out = self.create_network()
+        self._target_input_state, self._target_input_goal, self._target_input_history = self._target_inputs
+
+        self._target_network_params = tf.trainable_variables()[len(self._network_params):]
+
+        # Op for periodically updating target network with online network
+        # weights
+        self._update_target_network_params = [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) + tf.multiply(self._target_network_params[i], 1. - self._tau)) for i in range(len(self._target_network_params))]
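+        # i.e. theta_target <- tau * theta_online + (1 - tau) * theta_target after every
+        # update, so the target network slowly tracks the online one (TAU = 1e-3 in agent.py)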
+
+        # This gradient will be provided by the critic network
+        self._action_gradient = tf.placeholder(tf.float32, [None, self._dim_action])
+
+        # Combine the gradients here
+        self._unnormalized_actor_gradients = tf.gradients(self._net_scaled_out, self._network_params, -self._action_gradient)
+        self._actor_gradients = list(map(lambda x: tf.div(x, self._batch_size), self._unnormalized_actor_gradients))
+
+        # Optimization Op
+        self._optimize = tf.train.AdamOptimizer(self._learning_rate).apply_gradients(zip(self._actor_gradients, self._network_params))
+
+        self._num_trainable_vars = len(self._network_params) + len(self._target_network_params)
+
+    def create_network(self):
+        input_state = tflearn.input_data(shape=[None, self._dim_state], name='input_state')
+        input_goal = tflearn.input_data(shape=[None, self._dim_goal], name='input_goal')
+
+        input_memory = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_state + self._dim_action])
+
+        input_ff = tflearn.merge([input_goal, input_state], 'concat')
+
+        ff_branch = tflearn.fully_connected(input_ff, UNITS)
+        ff_branch = tflearn.activations.relu(ff_branch)
+
+        # recurrent_branch = tflearn.fully_connected(input_memory, UNITS)
+        # recurrent_branch = tflearn.activations.relu(recurrent_branch)
+        recurrent_branch = tflearn.lstm(input_memory, UNITS, dynamic=True)
+
+        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
+        out = tflearn.fully_connected(
+            merged_branch, self._dim_action, activation='tanh', weights_init=weights_init)
+        # Scale output to -action_bound to action_bound
+        scaled_out = tf.multiply(out, self._action_bound)
+        return [input_state, input_goal, input_memory], out, scaled_out
+
+    def train(self, input_state, input_goal, input_history, a_gradient):
+        self._sess.run(self._optimize, feed_dict={
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_history: input_history,
+            self._action_gradient: a_gradient
+        })
+
+    def predict(self, input_state, input_goal, input_history):
+        return self._sess.run(self._net_scaled_out, feed_dict={
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_history: input_history,
+        })
+
+    def predict_target(self, input_state, input_goal, input_history):
+        return self._sess.run(self._target_scaled_out, feed_dict={
+            self._target_input_state: input_state,
+            self._target_input_goal: input_goal,
+            self._target_input_history: input_history,
+        })
+
+    def update_target_network(self):
+        self._sess.run(self._update_target_network_params)
+
+    def get_num_trainable_vars(self):
+        return self._num_trainable_vars
diff --git a/agent.py b/agent.py
new file mode 100644
index 0000000..b432189
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,80 @@
+import gym
+import tensorflow as tf
+import numpy as np
+
+from actor import Actor
+from critic import Critic
+
+MAX_STEPS = 100
+TAU = 1e-3
+LEARNING_RATE = 5e-4
+
+class Agent:
+    def __init__(self, experiment, batch_size):
+        self._dummy_env = gym.make(experiment)
+        self._sess = tf.Session()
+
+        # Hardcoded for now
+        self._dim_state = 10
+        self._dim_goal = 3
+        self._dim_action = self._dummy_env.action_space.shape[0]
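+        # (the hardcoded 10 and 3 above are the sizes of FetchReach-v1's
+        # 'observation' and 'desired_goal' entries)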
+        self._dim_env = 1
+        self._batch_size = batch_size
+
+        self._actor = Actor(self._sess,
+            self._dim_state, self._dim_goal, self._dim_action, self._dummy_env, TAU, LEARNING_RATE, self._batch_size)
+
+        self._critic = Critic(self._sess,
+            self._dim_state, self._dim_goal, self._dim_action, self._dim_env, self._dummy_env, TAU, LEARNING_RATE, self._actor.get_num_trainable_vars())
+
+        self._sess.run(tf.global_variables_initializer())
+
+        self._actor.update_target_network()
+        self._critic.update_target_network()
+
+        #loss_summary = tf.summary.scalar('loss', self._critic._loss)
+        #writer = tf.summary.FileWriter('logs/')
+        #writer.add_summary(
+        #writer.add_graph(tf.get_default_graph())
+        #writer.flush()
+
+    def evaluate_actor(self, actor_predict, obs, goal, history):
+
+        assert (history.shape[0] == MAX_STEPS), "history must be of size MAX_STEPS"
+
+        obs = obs.reshape(1, self._dim_state)
+        goal = goal.reshape(1, self._dim_goal)
+        history = history.reshape(1, history.shape[0], history.shape[1])
+
+        return actor_predict(obs, goal, history)
+
+    def evaluate_actor_batch(self, actor_predict, obs, goal, history):
+
+        return actor_predict(obs, goal, history)
+
+    def evaluate_critic(self, critic_predict, obs, action, goal, history, env):
+        obs = obs.reshape(1, self._dim_state)
+        goal = goal.reshape(1, self._dim_goal)
+        action = action.reshape(1, self._dim_action)
+        history = history.reshape(1, history.shape[0], history.shape[1])
+        env = env.reshape(1, self._dim_env)
+
+        return critic_predict(env, obs, goal, action, history)
+
+    def evaluate_critic_batch(self, critic_predict, obs, action, goal, history, env):
+        return critic_predict(env, obs, goal, action, history)
+
+    def train_critic(self, obs, action, goal, history, env, predicted_q_value):
+        return self._critic.train(env, obs, goal, action, history, predicted_q_value)
+
+    def train_actor(self, obs, goal, history, a_gradient):
+        return self._actor.train(obs, goal, history, a_gradient)
+
+    def action_gradients_critic(self, obs, action, goal, history, env):
+        return self._critic.action_gradients(env, obs, goal, action, history)
+
+    def update_target_actor(self):
+        self._actor.update_target_network()
+
+    def update_target_critic(self):
+        self._critic.update_target_network()
diff --git a/critic.py b/critic.py
new file mode 100644
index 0000000..aa29144
--- /dev/null
+++ b/critic.py
@@ -0,0 +1,126 @@
+import tensorflow as tf
+import tflearn
+
+UNITS = 128
+MAX_STEPS = 100
+
+class Critic:
+    def __init__(self, session, dim_state, dim_goal, dim_action, dim_env, env, tau, learning_rate, num_actor_vars):
+        self._sess = session
+
+        self._dim_state = dim_state
+        self._dim_action = dim_action
+        self._dim_env = dim_env
+        self._dim_goal = dim_goal
+        self._action_bound = env.action_space.high
+
+        self._learning_rate = learning_rate
+        self._tau = tau
+
+
+        self._net_inputs, self._net_out = self.create_network()
+
+        self._net_input_env, self._net_input_goal, self._net_input_action, self._net_input_state, self._net_input_history = self._net_inputs
+
+        self._network_params = tf.trainable_variables()[num_actor_vars:]
+
+        self._target_inputs, self._target_out = self.create_network()
+
+        self._target_input_env, self._target_input_goal, self._target_input_action, self._target_input_state, self._target_input_history = self._target_inputs
+
+        self._target_network_params = tf.trainable_variables()[(len(self._network_params) + num_actor_vars):]
+
+        # Op for periodically updating target network with online network
+        # weights with regularization
+        self._update_target_network_params = \
+            [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) \
+            + tf.multiply(self._target_network_params[i], 1. - self._tau))
+                for i in range(len(self._target_network_params))]
+
+        # Network target (y_i)
+        self._predicted_q_value = tf.placeholder(tf.float32, [None, 1])
+
+        # Define loss and optimization Op
+        self._loss = tflearn.mean_square(self._predicted_q_value, self._net_out)
+        self._optimize = tf.train.AdamOptimizer(
+            self._learning_rate).minimize(self._loss)
+
+        # Get the gradient of the net w.r.t. the action.
+        # For each action in the minibatch (i.e., for each x in xs),
+        # this will sum up the gradients of each critic output in the minibatch
+        # w.r.t. that action. Each output is independent of all
+        # actions except for one.
+        self._action_grads = tf.gradients(self._net_out, self._net_input_action)
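+        # dQ/da is exactly what the actor needs: the deterministic policy gradient is
+        # E[ grad_a Q(s, a)|a=mu(s) * grad_theta mu(s) ], and this tensor is fed back
+        # into Actor.train() as a_gradient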
+
+    def create_network(self):
+        input_state = tflearn.input_data(shape=[None, self._dim_state])
+        input_goal = tflearn.input_data(shape=[None, self._dim_goal])
+        input_action = tflearn.input_data(shape=[None, self._dim_action])
+        input_env = tflearn.input_data(shape=[None, self._dim_env])
+
+        input_history = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_action + self._dim_state])
+
+        input_ff = tflearn.merge(
+            [input_env, input_goal, input_action, input_state], 'concat')
+
+        ff_branch = tflearn.fully_connected(input_ff, UNITS)
+        ff_branch = tflearn.activations.relu(ff_branch)
+
+        #recurrent_branch = tflearn.fully_connected(inputs, UNITS)
+        #recurrent_branch = tflearn.activations.relu(recurrent_branch)
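+        # the LSTM branch encodes the rolling (action, state) history of the episode as
+        # an extra feature vector next to the feed-forward branch (presumably so the
+        # networks can infer unobserved parameters of a randomized environment)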
+        recurrent_branch = tflearn.lstm(input_history, UNITS, dynamic=True)
+
+        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
+        merged_branch = tflearn.activations.relu(merged_branch)
+
+        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
+        out = tflearn.fully_connected(
+            merged_branch, 1, activation='linear', weights_init=weights_init)
+
+        return [input_env, input_goal, input_action, input_state, input_history], out
+
+
+    def train(self, input_env, input_state, input_goal, input_action, input_history, predicted_q_value):
+        return self._sess.run([self._net_out, self._optimize], feed_dict={
+            self._net_input_env: input_env,
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_action: input_action,
+            self._net_input_history: input_history,
+
+            self._predicted_q_value: predicted_q_value
+        })
+
+    def predict(self, input_env, input_state, input_goal, input_action, input_history):
+        return self._sess.run(self._net_out, feed_dict={
+            self._net_input_env: input_env,
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_action: input_action,
+            self._net_input_history: input_history,
+        })
+
+    def predict_target(self, input_env, input_state, input_goal, input_action, input_history):
+        return self._sess.run(self._target_out, feed_dict={
+            self._target_input_env: input_env,
+            self._target_input_state: input_state,
+            self._target_input_goal: input_goal,
+            self._target_input_action: input_action,
+            self._target_input_history: input_history,
+        })
+
+    def action_gradients(self, input_env, input_state, input_goal, input_action, input_history):
+        return self._sess.run(self._action_grads, feed_dict={
+            self._net_input_env: input_env,
+            self._net_input_state: input_state,
+            self._net_input_goal: input_goal,
+            self._net_input_action: input_action,
+            self._net_input_history: input_history
+        })
+
+    def update_target_network(self):
+        self._sess.run(self._update_target_network_params)
diff --git a/environment.py b/environment.py
new file mode 100644
index 0000000..e958e21
--- /dev/null
+++ b/environment.py
@@ -0,0 +1,24 @@
+import gym
+
+class RandomizedEnvironment:
+    """ Randomized environment class """
+    def __init__(self, experiment, parameter_ranges, goal_range):
+        self._experiment = experiment
+        self._parameter_ranges = parameter_ranges
+        self._goal_range = goal_range
+        self._params = [0]
+
+    def sample_env(self):
+        # environment randomization is not implemented yet: the parameter vector
+        # stays a single placeholder value (hence dim_env = 1 in Agent)
+        self._params = [0]
+        self._env = gym.make(self._experiment)
+        self._env.env.reward_type = "dense"
+
+    def get_env(self):
+        """
+        Returns a randomized environment and the vector of the parameter
+        space that corresponds to this very instance
+        """
+        return self._env, self._params
+
+    def get_goal(self):
+        return
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..6c0cbe2
--- /dev/null
+++ b/main.py
@@ -0,0 +1,94 @@
+import numpy as np
+import gym
+
+from environment import RandomizedEnvironment
+from agent import Agent
+from replay_buffer import ReplayBuffer
+
+EPISODES = 1000
+
+experiment = "FetchReach-v1"
+env = gym.make(experiment)
+
+# Initialize networks
+BATCH_SIZE = 128
+BUFFER_SIZE = 100000
+MAX_STEPS = 100
+GAMMA = 0.99
+
+agent = Agent(experiment, BATCH_SIZE)
+randomized_environment = RandomizedEnvironment(experiment, [], [])
+
+replay_buffer = ReplayBuffer(BUFFER_SIZE)
+
+dim_history_atom = agent._dim_state + agent._dim_action
+
+randomized_environment.sample_env()
+env, env_params = randomized_environment.get_env()
+
+success = 0
+
+for episode in range(EPISODES):
+    history = np.array(MAX_STEPS*[np.zeros(dim_history_atom)])
+    # generate a rollout
+
+    obs_dict = env.reset()
+    last_action = env.action_space.sample()  # fake last_action, to feed the network
+
+    obs = obs_dict['observation']
+    history = np.append(history, [np.concatenate((last_action, obs))], axis=0)[1:]
+
+    done = False
+
+    print("Episode : {}".format(episode))
+    tot_rew = 0
+
+    while not done:
+        obs = obs_dict['observation']
+        goal = obs_dict['desired_goal']
+        action = agent.evaluate_actor(agent._actor.predict, obs, goal, history)
+
+        if episode > 600:
+            env.render()
+
+        new_obs_dict, step_reward, done, info = env.step(action[0])
+        tot_rew += step_reward
+        new_obs = new_obs_dict['observation']
+
+        history = np.append(history, [np.concatenate((action[0], obs))], axis=0)[1:]
+
+        replay_buffer.add(obs, action, step_reward, done, new_obs, history, env_params, goal)
+
+        # advance to the next observation; obs_dict was previously never updated,
+        # so every step reused the initial observation and goal
+        obs = new_obs
+        obs_dict = new_obs_dict
+
+        if done and info['is_success'] > 0.01:
+            success += 1
+            # episode is 0-indexed, so use episode + 1 to avoid dividing by zero
+            print("Success ! {}/{} ({})".format(success, episode + 1, success/(episode + 1)))
+
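+        # one DDPG update per episode, run on the final step once the buffer holds a
+        # full batch: sample a minibatch, build the Bellman targets
+        # y = r + GAMMA * Q'(s2, mu'(s2)), fit the critic, then update the actor with
+        # the critic's action gradient and soft-update both target networks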
+        if replay_buffer.size() > BATCH_SIZE and done:
+            s_batch, a_batch, r_batch, t_batch, s2_batch, history_batch, env_batch, goal_batch = replay_buffer.sample_batch(BATCH_SIZE)
+
+            target_action_batch = agent.evaluate_actor_batch(agent._actor.predict_target, s2_batch, goal_batch, history_batch)
+
+            # use the *target* actor's actions for the Bellman backup (target_action_batch
+            # was previously computed but left unused in favour of the online actor)
+            target_q = agent.evaluate_critic_batch(agent._critic.predict_target, s2_batch, target_action_batch, goal_batch, history_batch, env_batch)
+
+            y_i = []
+            for k in range(BATCH_SIZE):
+                if t_batch[k]:
+                    y_i.append(r_batch[k])
+                else:
+                    y_i.append(r_batch[k] + GAMMA * target_q[k])
+
+            predicted_q_value, _ = agent.train_critic(s_batch, a_batch, goal_batch, history_batch, env_batch, np.reshape(y_i, (BATCH_SIZE, 1)))
+
+            # Update the actor policy using the sampled gradient
+            a_outs = agent.evaluate_actor_batch(agent._actor.predict, s_batch, goal_batch, history_batch)
+            grads = agent.action_gradients_critic(s_batch, a_outs, goal_batch, history_batch, env_batch)
+            agent.train_actor(s_batch, goal_batch, history_batch, grads[0])
+
+            # Update target networks
+            agent.update_target_actor()
+            agent.update_target_critic()
+    print("Reward : {}".format(tot_rew))
diff --git a/replay_buffer.py b/replay_buffer.py
new file mode 100644
index 0000000..1c8de0c
--- /dev/null
+++ b/replay_buffer.py
@@ -0,0 +1,54 @@
+"""
+Data structure for implementing experience replay
+
+Author: Patrick Emami
+"""
+from collections import deque
+import random
+import numpy as np
+
+class ReplayBuffer(object):
+
+    def __init__(self, buffer_size, random_seed=123):
+        """
+        The right side of the deque contains the most recent experiences
+        """
+        self.buffer_size = buffer_size
+        self.count = 0
+        self.buffer = deque()
+        random.seed(random_seed)
+
+    def add(self, s, a, r, t, s2, history, env, goal):
+        experience = (s, a, r, t, s2, history, env, goal)
+        if self.count < self.buffer_size:
+            self.buffer.append(experience)
+            self.count += 1
+        else:
+            self.buffer.popleft()
+            self.buffer.append(experience)
+
+    def size(self):
+        return self.count
+
+    def sample_batch(self, batch_size):
+        batch = []
+
+        if self.count < batch_size:
+            batch = random.sample(self.buffer, self.count)
+        else:
+            batch = random.sample(self.buffer, batch_size)
+
+        s_batch = np.array([_[0] for _ in batch])
+        # actions are stored with shape (1, 4); flatten them (4 = action dim of FetchReach-v1)
+        a_batch = np.array([_[1].reshape(4) for _ in batch])
+        r_batch = np.array([_[2] for _ in batch])
+        t_batch = np.array([_[3] for _ in batch])
+        s2_batch = np.array([_[4] for _ in batch])
+        history_batch = np.array([_[5] for _ in batch])
+        env_batch = np.array([_[6] for _ in batch])
+        goal_batch = np.array([_[7] for _ in batch])
+
+        return s_batch, a_batch, r_batch, t_batch, s2_batch, history_batch, env_batch, goal_batch
+
+    def clear(self):
+        self.buffer.clear()
+        self.count = 0