commit 63c7972
Showing 7 changed files with 481 additions and 0 deletions.
@@ -0,0 +1,2 @@
__pycache__/*
logs
@@ -0,0 +1,101 @@
import tensorflow as tf
import tflearn

UNITS = 128
MAX_STEPS = 100


class Actor:
    def __init__(self, session, dim_state, dim_goal, dim_action, env, tau, learning_rate, batch_size):
        self._sess = session

        self._dim_state = dim_state
        self._dim_action = dim_action
        self._dim_goal = dim_goal
        self._action_bound = env.action_space.high
        self._internal_memory = []
        self._tau = tau
        self._learning_rate = learning_rate
        self._batch_size = batch_size

        self._net_inputs, self._net_out, self._net_scaled_out = self.create_network()
        self._net_input_state, self._net_input_goal, self._net_input_history = self._net_inputs

        self._network_params = tf.trainable_variables()

        self._target_inputs, self._target_out, self._target_scaled_out = self.create_network()
        self._target_input_state, self._target_input_goal, self._target_input_history = self._target_inputs

        self._target_network_params = tf.trainable_variables()[len(self._network_params):]

        # Op for periodically updating target network with online network
        # weights
        self._update_target_network_params = [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) + tf.multiply(self._target_network_params[i], 1. - self._tau)) for i in range(len(self._target_network_params))]

        # This gradient will be provided by the critic network
        self._action_gradient = tf.placeholder(tf.float32, [None, self._dim_action])

        # Combine the gradients here
        self._unnormalized_actor_gradients = tf.gradients(self._net_scaled_out, self._network_params, -self._action_gradient)
        self._actor_gradients = list(map(lambda x: tf.div(x, self._batch_size), self._unnormalized_actor_gradients))

        # Optimization Op
        self._optimize = tf.train.AdamOptimizer(self._learning_rate).apply_gradients(zip(self._actor_gradients, self._network_params))

        self._num_trainable_vars = len(self._network_params) + len(self._target_network_params)

    def create_network(self):
        input_state = tflearn.input_data(shape=[None, self._dim_state], name='input_state')
        input_goal = tflearn.input_data(shape=[None, self._dim_goal], name='input_goal')

        input_memory = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_state + self._dim_action])

        input_ff = tflearn.merge([input_goal, input_state], 'concat')

        ff_branch = tflearn.fully_connected(input_ff, UNITS)
        ff_branch = tflearn.activations.relu(ff_branch)

        # recurrent_branch = tflearn.fully_connected(input_memory, UNITS)
        # recurrent_branch = tflearn.activations.relu(recurrent_branch)
        recurrent_branch = tflearn.lstm(input_memory, UNITS, dynamic=True)

        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            merged_branch, self._dim_action, activation='tanh', weights_init=weights_init)
        # Scale output to -action_bound to action_bound
        scaled_out = tf.multiply(out, self._action_bound)
        return [input_state, input_goal, input_memory], out, scaled_out

    def train(self, input_state, input_goal, input_history, a_gradient):
        self._sess.run(self._optimize, feed_dict={
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_history: input_history,
            self._action_gradient: a_gradient
        })

    def predict(self, input_state, input_goal, input_history):
        return self._sess.run(self._net_scaled_out, feed_dict={
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_history: input_history,
        })

    def predict_target(self, input_state, input_goal, input_history):
        return self._sess.run(self._target_scaled_out, feed_dict={
            self._target_input_state: input_state,
            self._target_input_goal: input_goal,
            self._target_input_history: input_history,
        })

    def update_target_network(self):
        self._sess.run(self._update_target_network_params)

    def get_num_trainable_vars(self):
        return self._num_trainable_vars
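The target network above tracks the online network through the soft-update assign ops built in __init__. The snippet below is a minimal NumPy illustration of that blend rule, not part of the commit; the 4x4 arrays and their values are made up, only the tau mixing mirrors the ops above.

import numpy as np

TAU = 1e-3  # same order of magnitude as the tau passed to Actor

# Stand-ins for one online weight tensor and its target copy.
online_w = np.random.randn(4, 4)
target_w = np.random.randn(4, 4)

# Soft update: target <- tau * online + (1 - tau) * target,
# the same rule the assign ops in __init__ apply to every variable.
target_w = TAU * online_w + (1.0 - TAU) * target_w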
@@ -0,0 +1,80 @@
import gym
import tensorflow as tf
import numpy as np

from actor import Actor
from critic import Critic

MAX_STEPS = 100
TAU = 1e-3
LEARNING_RATE = 5e-4


class Agent:
    def __init__(self, experiment, batch_size):
        self._dummy_env = gym.make(experiment)
        self._sess = tf.Session()

        # Hardcoded for now
        self._dim_state = 10
        self._dim_goal = 3
        self._dim_action = self._dummy_env.action_space.shape[0]
        self._dim_env = 1
        self._batch_size = batch_size

        self._actor = Actor(self._sess,
            self._dim_state, self._dim_goal, self._dim_action, self._dummy_env, TAU, LEARNING_RATE, self._batch_size)

        self._critic = Critic(self._sess,
            self._dim_state, self._dim_goal, self._dim_action, self._dim_env, self._dummy_env, TAU, LEARNING_RATE, self._actor.get_num_trainable_vars())

        self._sess.run(tf.global_variables_initializer())

        self._actor.update_target_network()
        self._critic.update_target_network()

        #loss_summary = tf.summary.scalar('loss', self._critic._loss)
        #writer = tf.summary.FileWriter('logs/')
        #writer.add_summary(
        #writer.add_graph(tf.get_default_graph())
        #writer.flush()

    def evaluate_actor(self, actor_predict, obs, goal, history):
        assert (history.shape[0] == MAX_STEPS), "history must be of size MAX_STEPS"

        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        history = history.reshape(1, history.shape[0], history.shape[1])

        return actor_predict(obs, goal, history)

    def evaluate_actor_batch(self, actor_predict, obs, goal, history):
        return actor_predict(obs, goal, history)

    def evaluate_critic(self, critic_predict, obs, action, goal, history, env):
        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        action = action.reshape(1, self._dim_action)
        history = history.reshape(1, history.shape[0], history.shape[1])
        env = env.reshape(1, self._dim_env)

        return critic_predict(env, obs, goal, action, history)

    def evaluate_critic_batch(self, critic_predict, obs, action, goal, history, env):
        return critic_predict(env, obs, goal, action, history)

    def train_critic(self, obs, action, goal, history, env, predicted_q_value):
        return self._critic.train(env, obs, goal, action, history, predicted_q_value)

    def train_actor(self, obs, goal, history, a_gradient):
        return self._actor.train(obs, goal, history, a_gradient)

    def action_gradients_critic(self, obs, action, goal, history, env):
        return self._critic.action_gradients(env, obs, goal, action, history)

    def update_target_actor(self):
        self._actor.update_target_network()

    def update_target_critic(self):
        self._critic.update_target_network()
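As a rough usage sketch (not part of the commit): the hardcoded 10-dimensional state and 3-dimensional goal match gym's Fetch robotics tasks, so the class could be driven roughly as follows. The environment id "FetchReach-v1", the module name agent, and the zero-filled inputs are assumptions for illustration only.

import numpy as np
from agent import Agent  # assumes this file is saved as agent.py

MAX_STEPS = 100
BATCH_SIZE = 32

# "FetchReach-v1" is an assumption; any goal-conditioned env whose
# observation/goal sizes match the hardcoded 10/3 above would do.
agent = Agent("FetchReach-v1", BATCH_SIZE)

obs = np.zeros(10)
goal = np.zeros(3)
# One (state, action) pair per step, zero-padded to MAX_STEPS rows.
history = np.zeros((MAX_STEPS, 10 + agent._dim_action))

action = agent.evaluate_actor(agent._actor.predict, obs, goal, history)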
@@ -0,0 +1,126 @@
import tensorflow as tf
import tflearn

UNITS = 128
MAX_STEPS = 100


class Critic:
    def __init__(self, session, dim_state, dim_goal, dim_action, dim_env, env, tau, learning_rate, num_actor_vars):
        self._sess = session

        self._dim_state = dim_state
        self._dim_action = dim_action
        self._dim_env = dim_env
        self._dim_goal = dim_goal
        self._action_bound = env.action_space.high

        self._learning_rate = learning_rate
        self._tau = tau

        self._net_inputs, self._net_out = self.create_network()

        self._net_input_env, self._net_input_goal, self._net_input_action, self._net_input_state, self._net_input_history = self._net_inputs

        self._network_params = tf.trainable_variables()[num_actor_vars:]

        self._target_inputs, self._target_out = self.create_network()

        self._target_input_env, self._target_input_goal, self._target_input_action, self._target_input_state, self._target_input_history = self._target_inputs

        self._target_network_params = tf.trainable_variables()[(len(self._network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network
        # weights with regularization
        self._update_target_network_params = \
            [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) \
                + tf.multiply(self._target_network_params[i], 1. - self._tau))
                for i in range(len(self._target_network_params))]

        # Network target (y_i)
        self._predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self._loss = tflearn.mean_square(self._predicted_q_value, self._net_out)
        self._optimize = tf.train.AdamOptimizer(
            self._learning_rate).minimize(self._loss)

        # Get the gradient of the net w.r.t. the action.
        # For each action in the minibatch (i.e., for each x in xs),
        # this will sum up the gradients of each critic output in the minibatch
        # w.r.t. that action. Each output is independent of all
        # actions except for one.
        self._action_grads = tf.gradients(self._net_out, self._net_input_action)

    def create_network(self):
        input_state = tflearn.input_data(shape=[None, self._dim_state])
        input_goal = tflearn.input_data(shape=[None, self._dim_goal])
        input_action = tflearn.input_data(shape=[None, self._dim_action])
        input_env = tflearn.input_data(shape=[None, self._dim_env])

        input_history = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_action + self._dim_state])

        input_ff = tflearn.merge(
            [input_env, input_goal, input_action, input_state], 'concat')

        ff_branch = tflearn.fully_connected(input_ff, UNITS)
        ff_branch = tflearn.activations.relu(ff_branch)

        #recurrent_branch = tflearn.fully_connected(inputs, UNITS)
        #recurrent_branch = tflearn.activations.relu(recurrent_branch)
        recurrent_branch = tflearn.lstm(input_history, UNITS, dynamic=True)

        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            merged_branch, 1, activation='linear', weights_init=weights_init)

        return [input_env, input_goal, input_action, input_state, input_history], out

    def train(self, input_env, input_state, input_goal, input_action, input_history, predicted_q_value):
        return self._sess.run([self._net_out, self._optimize], feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history,
            self._predicted_q_value: predicted_q_value
        })

    def predict(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._net_out, feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history,
        })

    def predict_target(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._target_out, feed_dict={
            self._target_input_env: input_env,
            self._target_input_state: input_state,
            self._target_input_goal: input_goal,
            self._target_input_action: input_action,
            self._target_input_history: input_history,
        })

    def action_gradients(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._action_grads, feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history
        })

    def update_target_network(self):
        self._sess.run(self._update_target_network_params)
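Critic.train above consumes predicted_q_value as the TD target y_i, but the training loop that builds it is not part of this commit. The sketch below shows how such a target is typically formed in DDPG-style training; the discount factor, terminal mask, and zero-filled arrays are assumptions for illustration.

import numpy as np

GAMMA = 0.98      # assumed discount factor; not defined anywhere in this commit
BATCH_SIZE = 32

rewards = np.zeros((BATCH_SIZE, 1))        # r_i from a replay buffer (placeholder values)
terminal = np.zeros((BATCH_SIZE, 1))       # 1.0 where the episode ended
target_q_next = np.zeros((BATCH_SIZE, 1))  # e.g. critic.predict_target(...) on next states

# y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})), masked at terminal steps;
# this array is what would be fed to Critic.train as predicted_q_value.
y = rewards + GAMMA * (1.0 - terminal) * target_q_next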
@@ -0,0 +1,24 @@
import gym


class RandomizedEnvironment:
    """ Randomized environment class """
    def __init__(self, experiment, parameter_ranges, goal_range):
        self._experiment = experiment
        self._parameter_ranges = parameter_ranges
        self._goal_range = goal_range
        self._params = [0]

    def sample_env(self):
        self._params = [0]
        self._env = gym.make(self._experiment)
        self._env.env.reward_type = "dense"

    def get_env(self):
        """
        Returns a randomized environment and the vector of the parameter
        space that corresponds to this very instance
        """
        return self._env, self._params

    def get_goal(self):
        return
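A short usage sketch for the class above (the environment id and constructor arguments are placeholders; as written, the class ignores parameter_ranges, always reports params = [0], and get_goal returns None):

from randomized_environment import RandomizedEnvironment  # assumed module name

randomized = RandomizedEnvironment("FetchReach-v1", parameter_ranges=[], goal_range=None)
randomized.sample_env()             # builds a fresh env with dense rewards
env, params = randomized.get_env()  # params is always [0] for now
obs = env.reset()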