
Commit 63c7972: Initial commit
little-nem committed May 31, 2019
Showing 7 changed files with 481 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
__pycache__/*
logs
101 changes: 101 additions & 0 deletions actor.py
@@ -0,0 +1,101 @@
import tensorflow as tf
import tflearn

UNITS = 128
MAX_STEPS = 100

class Actor:
    def __init__(self, session, dim_state, dim_goal, dim_action, env, tau, learning_rate, batch_size):
        self._sess = session

        self._dim_state = dim_state
        self._dim_action = dim_action
        self._dim_goal = dim_goal
        self._action_bound = env.action_space.high
        self._internal_memory = []
        self._tau = tau
        self._learning_rate = learning_rate
        self._batch_size = batch_size

        self._net_inputs, self._net_out, self._net_scaled_out = self.create_network()
        self._net_input_state, self._net_input_goal, self._net_input_history = self._net_inputs

        self._network_params = tf.trainable_variables()

        self._target_inputs, self._target_out, self._target_scaled_out = self.create_network()
        self._target_input_state, self._target_input_goal, self._target_input_history = self._target_inputs

        self._target_network_params = tf.trainable_variables()[len(self._network_params):]

        # Op for periodically updating target network with online network
        # weights
        self._update_target_network_params = [
            self._target_network_params[i].assign(
                tf.multiply(self._network_params[i], self._tau)
                + tf.multiply(self._target_network_params[i], 1. - self._tau))
            for i in range(len(self._target_network_params))]
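        # i.e. a "soft" update, theta_target <- tau * theta + (1 - tau) * theta_target,
        # so the target network slowly tracks the online network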

        # This gradient will be provided by the critic network
        self._action_gradient = tf.placeholder(tf.float32, [None, self._dim_action])

        # Combine the gradients here
        self._unnormalized_actor_gradients = tf.gradients(
            self._net_scaled_out, self._network_params, -self._action_gradient)
        self._actor_gradients = list(map(lambda x: tf.div(x, self._batch_size), self._unnormalized_actor_gradients))
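        # Together these form the sampled deterministic policy gradient,
        # grad_theta J ~ (1/N) sum_i grad_a Q(s_i, a)|_{a = mu(s_i)} * grad_theta mu(s_i),
        # averaged over the minibatch (the critic supplies grad_a Q)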

        # Optimization Op
        self._optimize = tf.train.AdamOptimizer(self._learning_rate).apply_gradients(
            zip(self._actor_gradients, self._network_params))

        self._num_trainable_vars = len(self._network_params) + len(self._target_network_params)

    def create_network(self):
        input_state = tflearn.input_data(shape=[None, self._dim_state], name='input_state')
        input_goal = tflearn.input_data(shape=[None, self._dim_goal], name='input_goal')

        input_memory = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_state + self._dim_action])

        input_ff = tflearn.merge([input_goal, input_state], 'concat')

        ff_branch = tflearn.fully_connected(input_ff, UNITS)
        ff_branch = tflearn.activations.relu(ff_branch)

        # recurrent_branch = tflearn.fully_connected(input_memory, UNITS)
        # recurrent_branch = tflearn.activations.relu(recurrent_branch)
        recurrent_branch = tflearn.lstm(input_memory, UNITS, dynamic=True)
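        # the (state, action) history is summarized by this LSTM branch, while the
        # current state and goal go through the feed-forward branch above; the two
        # branches are merged below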

        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            merged_branch, self._dim_action, activation='tanh', weights_init=weights_init)
        # Scale output to -action_bound to action_bound
        scaled_out = tf.multiply(out, self._action_bound)
        return [input_state, input_goal, input_memory], out, scaled_out

    def train(self, input_state, input_goal, input_history, a_gradient):
        self._sess.run(self._optimize, feed_dict={
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_history: input_history,
            self._action_gradient: a_gradient
        })

    def predict(self, input_state, input_goal, input_history):
        return self._sess.run(self._net_scaled_out, feed_dict={
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_history: input_history,
        })

    def predict_target(self, input_state, input_goal, input_history):
        return self._sess.run(self._target_scaled_out, feed_dict={
            self._target_input_state: input_state,
            self._target_input_goal: input_goal,
            self._target_input_history: input_history,
        })

    def update_target_network(self):
        self._sess.run(self._update_target_network_params)

    def get_num_trainable_vars(self):
        return self._num_trainable_vars
80 changes: 80 additions & 0 deletions agent.py
@@ -0,0 +1,80 @@
import gym
import tensorflow as tf
import numpy as np

from actor import Actor
from critic import Critic

MAX_STEPS = 100
TAU = 1e-3
LEARNING_RATE = 5e-4

class Agent:
    def __init__(self, experiment, batch_size):
        self._dummy_env = gym.make(experiment)
        self._sess = tf.Session()

        # Hardcoded for now
        self._dim_state = 10
        self._dim_goal = 3
        self._dim_action = self._dummy_env.action_space.shape[0]
        self._dim_env = 1
        self._batch_size = batch_size

        self._actor = Actor(self._sess,
            self._dim_state, self._dim_goal, self._dim_action, self._dummy_env, TAU, LEARNING_RATE, self._batch_size)

        self._critic = Critic(self._sess,
            self._dim_state, self._dim_goal, self._dim_action, self._dim_env, self._dummy_env, TAU, LEARNING_RATE, self._actor.get_num_trainable_vars())

        self._sess.run(tf.global_variables_initializer())

        self._actor.update_target_network()
        self._critic.update_target_network()

        #loss_summary = tf.summary.scalar('loss', self._critic._loss)
        #writer = tf.summary.FileWriter('logs/')
        #writer.add_summary(
        #writer.add_graph(tf.get_default_graph())
        #writer.flush()

    def evaluate_actor(self, actor_predict, obs, goal, history):

        assert (history.shape[0] == MAX_STEPS), "history must be of size MAX_STEPS"

        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        history = history.reshape(1, history.shape[0], history.shape[1])
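        # the reshapes above add a leading batch dimension of size 1, since the
        # actor network expects batched inputs even for a single sample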

        return actor_predict(obs, goal, history)

    def evaluate_actor_batch(self, actor_predict, obs, goal, history):

        return actor_predict(obs, goal, history)

    def evaluate_critic(self, critic_predict, obs, action, goal, history, env):
        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        action = action.reshape(1, self._dim_action)
        history = history.reshape(1, history.shape[0], history.shape[1])
        env = env.reshape(1, self._dim_env)

        return critic_predict(env, obs, goal, action, history)

    def evaluate_critic_batch(self, critic_predict, obs, action, goal, history, env):
        return critic_predict(env, obs, goal, action, history)

    def train_critic(self, obs, action, goal, history, env, predicted_q_value):
        return self._critic.train(env, obs, goal, action, history, predicted_q_value)

    def train_actor(self, obs, goal, history, a_gradient):
        return self._actor.train(obs, goal, history, a_gradient)

    def action_gradients_critic(self, obs, action, goal, history, env):
        return self._critic.action_gradients(env, obs, goal, action, history)

    def update_target_actor(self):
        self._actor.update_target_network()

    def update_target_critic(self):
        self._critic.update_target_network()
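# Hypothetical usage sketch (not part of this commit): the training loop is not
# included in this file, and 'FetchReach-v1' is only a guess consistent with the
# hardcoded dim_state = 10 and dim_goal = 3.
#
#   agent = Agent('FetchReach-v1', batch_size=64)
#   action = agent.evaluate_actor(agent._actor.predict, obs, goal, history)
#   agent.update_target_actor()
#   agent.update_target_critic()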
126 changes: 126 additions & 0 deletions critic.py
@@ -0,0 +1,126 @@
import tensorflow as tf
import tflearn

UNITS = 128
MAX_STEPS = 100

class Critic:
    def __init__(self, session, dim_state, dim_goal, dim_action, dim_env, env, tau, learning_rate, num_actor_vars):
        self._sess = session

        self._dim_state = dim_state
        self._dim_action = dim_action
        self._dim_env = dim_env
        self._dim_goal = dim_goal
        self._action_bound = env.action_space.high

        self._learning_rate = learning_rate
        self._tau = tau

        self._net_inputs, self._net_out = self.create_network()

        self._net_input_env, self._net_input_goal, self._net_input_action, self._net_input_state, self._net_input_history = self._net_inputs

        self._network_params = tf.trainable_variables()[num_actor_vars:]

        self._target_inputs, self._target_out = self.create_network()

        self._target_input_env, self._target_input_goal, self._target_input_action, self._target_input_state, self._target_input_history = self._target_inputs

        self._target_network_params = tf.trainable_variables()[(len(self._network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network
        # weights
        self._update_target_network_params = [
            self._target_network_params[i].assign(
                tf.multiply(self._network_params[i], self._tau)
                + tf.multiply(self._target_network_params[i], 1. - self._tau))
            for i in range(len(self._target_network_params))]

        # Network target (y_i)
        self._predicted_q_value = tf.placeholder(tf.float32, [None, 1])
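        # In standard DDPG this placeholder is fed with the Bellman target
        # y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})), which the training loop
        # (not part of this file) is expected to compute from the target networks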

        # Define loss and optimization Op
        self._loss = tflearn.mean_square(self._predicted_q_value, self._net_out)
        self._optimize = tf.train.AdamOptimizer(
            self._learning_rate).minimize(self._loss)

        # Get the gradient of the net w.r.t. the action.
        # For each action in the minibatch (i.e., for each x in xs),
        # this will sum up the gradients of each critic output in the minibatch
        # w.r.t. that action. Each output is independent of all
        # actions except for one.
        self._action_grads = tf.gradients(self._net_out, self._net_input_action)
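        # These gradients are what the agent feeds back to Actor.train() as
        # a_gradient (see Agent.action_gradients_critic and Agent.train_actor)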

    def create_network(self):
        input_state = tflearn.input_data(shape=[None, self._dim_state])
        input_goal = tflearn.input_data(shape=[None, self._dim_goal])
        input_action = tflearn.input_data(shape=[None, self._dim_action])
        input_env = tflearn.input_data(shape=[None, self._dim_env])

        input_history = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_action + self._dim_state])

        input_ff = tflearn.merge(
            [input_env, input_goal, input_action, input_state], 'concat')

        ff_branch = tflearn.fully_connected(input_ff, UNITS)
        ff_branch = tflearn.activations.relu(ff_branch)

        #recurrent_branch = tflearn.fully_connected(inputs, UNITS)
        #recurrent_branch = tflearn.activations.relu(recurrent_branch)
        recurrent_branch = tflearn.lstm(input_history, UNITS, dynamic=True)
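        # same two-branch layout as the actor: an LSTM summarizes the
        # (state, action) history, while the environment parameters, goal,
        # action and state feed the dense branch above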

        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            merged_branch, 1, activation='linear', weights_init=weights_init)

        return [input_env, input_goal, input_action, input_state, input_history], out

    def train(self, input_env, input_state, input_goal, input_action, input_history, predicted_q_value):
        return self._sess.run([self._net_out, self._optimize], feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history,
            self._predicted_q_value: predicted_q_value
        })

    def predict(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._net_out, feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history,
        })

    def predict_target(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._target_out, feed_dict={
            self._target_input_env: input_env,
            self._target_input_state: input_state,
            self._target_input_goal: input_goal,
            self._target_input_action: input_action,
            self._target_input_history: input_history,
        })

    def action_gradients(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._action_grads, feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history
        })

    def update_target_network(self):
        self._sess.run(self._update_target_network_params)
24 changes: 24 additions & 0 deletions environment.py
@@ -0,0 +1,24 @@
import gym

class RandomizedEnvironment:
""" Randomized environment class """
def __init__(self, experiment, parameter_ranges, goal_range):
self._experiment = experiment
self._parameter_ranges = parameter_ranges
self._goal_range = goal_range
self._params = [0]

def sample_env(self):
self._params = [0]
self._env = gym.make(self._experiment)
self._env.env.reward_type="dense"
def get_env(self):
"""
Returns a randomized environment and the vector of the parameter
space that corresponds to this very instance
"""
return self._env, self._params

def get_goal(self):
return
