commit 63c7972
Showing 7 changed files with 481 additions and 0 deletions.
@@ -0,0 +1,2 @@
__pycache__/*
logs
@@ -0,0 +1,101 @@
import tensorflow as tf
import tflearn

UNITS = 128
MAX_STEPS = 100


class Actor:
    def __init__(self, session, dim_state, dim_goal, dim_action, env, tau, learning_rate, batch_size):
        self._sess = session

        self._dim_state = dim_state
        self._dim_action = dim_action
        self._dim_goal = dim_goal
        self._action_bound = env.action_space.high
        self._internal_memory = []
        self._tau = tau
        self._learning_rate = learning_rate
        self._batch_size = batch_size

        self._net_inputs, self._net_out, self._net_scaled_out = self.create_network()
        self._net_input_state, self._net_input_goal, self._net_input_history = self._net_inputs

        self._network_params = tf.trainable_variables()

        self._target_inputs, self._target_out, self._target_scaled_out = self.create_network()
        self._target_input_state, self._target_input_goal, self._target_input_history = self._target_inputs

        self._target_network_params = tf.trainable_variables()[len(self._network_params):]

        # Op for periodically updating target network with online network
        # weights
        self._update_target_network_params = [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) + tf.multiply(self._target_network_params[i], 1. - self._tau)) for i in range(len(self._target_network_params))]

        # This gradient will be provided by the critic network
        self._action_gradient = tf.placeholder(tf.float32, [None, self._dim_action])

        # Combine the gradients here
        self._unnormalized_actor_gradients = tf.gradients(self._net_scaled_out, self._network_params, -self._action_gradient)
        self._actor_gradients = list(map(lambda x: tf.div(x, self._batch_size), self._unnormalized_actor_gradients))

        # Optimization Op
        self._optimize = tf.train.AdamOptimizer(self._learning_rate).apply_gradients(zip(self._actor_gradients, self._network_params))

        self._num_trainable_vars = len(self._network_params) + len(self._target_network_params)

    def create_network(self):
        input_state = tflearn.input_data(shape=[None, self._dim_state], name='input_state')
        input_goal = tflearn.input_data(shape=[None, self._dim_goal], name='input_goal')

        input_memory = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_state + self._dim_action])

        input_ff = tflearn.merge([input_goal, input_state], 'concat')

        ff_branch = tflearn.fully_connected(input_ff, UNITS)
        ff_branch = tflearn.activations.relu(ff_branch)

        # recurrent_branch = tflearn.fully_connected(input_memory, UNITS)
        # recurrent_branch = tflearn.activations.relu(recurrent_branch)
        recurrent_branch = tflearn.lstm(input_memory, UNITS, dynamic=True)

        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            merged_branch, self._dim_action, activation='tanh', weights_init=weights_init)
        # Scale output to -action_bound to action_bound
        scaled_out = tf.multiply(out, self._action_bound)
        return [input_state, input_goal, input_memory], out, scaled_out

    def train(self, input_state, input_goal, input_history, a_gradient):
        self._sess.run(self._optimize, feed_dict={
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_history: input_history,
            self._action_gradient: a_gradient
        })

    def predict(self, input_state, input_goal, input_history):
        return self._sess.run(self._net_scaled_out, feed_dict={
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_history: input_history,
        })

    def predict_target(self, input_state, input_goal, input_history):
        return self._sess.run(self._target_scaled_out, feed_dict={
            self._target_input_state: input_state,
            self._target_input_goal: input_goal,
            self._target_input_history: input_history,
        })

    def update_target_network(self):
        self._sess.run(self._update_target_network_params)

    def get_num_trainable_vars(self):
        return self._num_trainable_vars
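The target network above tracks the online network through the soft-update assign ops built in __init__. The snippet below is a minimal NumPy illustration of that blend rule, not part of the commit; the 4x4 arrays and their values are made up, only the tau mixing mirrors the ops above.

import numpy as np

TAU = 1e-3  # same order of magnitude as the tau passed to Actor

# Stand-ins for one online weight tensor and its target copy.
online_w = np.random.randn(4, 4)
target_w = np.random.randn(4, 4)

# Soft update: target <- tau * online + (1 - tau) * target,
# the same rule the assign ops in __init__ apply to every variable.
target_w = TAU * online_w + (1.0 - TAU) * target_w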
@@ -0,0 +1,80 @@
import gym
import tensorflow as tf
import numpy as np

from actor import Actor
from critic import Critic

MAX_STEPS = 100
TAU = 1e-3
LEARNING_RATE = 5e-4


class Agent:
    def __init__(self, experiment, batch_size):
        self._dummy_env = gym.make(experiment)
        self._sess = tf.Session()

        # Hardcoded for now
        self._dim_state = 10
        self._dim_goal = 3
        self._dim_action = self._dummy_env.action_space.shape[0]
        self._dim_env = 1
        self._batch_size = batch_size

        self._actor = Actor(self._sess,
            self._dim_state, self._dim_goal, self._dim_action, self._dummy_env, TAU, LEARNING_RATE, self._batch_size)

        self._critic = Critic(self._sess,
            self._dim_state, self._dim_goal, self._dim_action, self._dim_env, self._dummy_env, TAU, LEARNING_RATE, self._actor.get_num_trainable_vars())

        self._sess.run(tf.global_variables_initializer())

        self._actor.update_target_network()
        self._critic.update_target_network()

        #loss_summary = tf.summary.scalar('loss', self._critic._loss)
        #writer = tf.summary.FileWriter('logs/')
        #writer.add_summary(
        #writer.add_graph(tf.get_default_graph())
        #writer.flush()

    def evaluate_actor(self, actor_predict, obs, goal, history):
        assert (history.shape[0] == MAX_STEPS), "history must be of size MAX_STEPS"

        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        history = history.reshape(1, history.shape[0], history.shape[1])

        return actor_predict(obs, goal, history)

    def evaluate_actor_batch(self, actor_predict, obs, goal, history):
        return actor_predict(obs, goal, history)

    def evaluate_critic(self, critic_predict, obs, action, goal, history, env):
        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        action = action.reshape(1, self._dim_action)
        history = history.reshape(1, history.shape[0], history.shape[1])
        env = env.reshape(1, self._dim_env)

        return critic_predict(env, obs, goal, action, history)

    def evaluate_critic_batch(self, critic_predict, obs, action, goal, history, env):
        return critic_predict(env, obs, goal, action, history)

    def train_critic(self, obs, action, goal, history, env, predicted_q_value):
        return self._critic.train(env, obs, goal, action, history, predicted_q_value)

    def train_actor(self, obs, goal, history, a_gradient):
        return self._actor.train(obs, goal, history, a_gradient)

    def action_gradients_critic(self, obs, action, goal, history, env):
        return self._critic.action_gradients(env, obs, goal, action, history)

    def update_target_actor(self):
        self._actor.update_target_network()

    def update_target_critic(self):
        self._critic.update_target_network()
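As a rough usage sketch (not part of the commit): the hardcoded 10-dimensional state and 3-dimensional goal match gym's Fetch robotics tasks, so the class could be driven roughly as follows. The environment id "FetchReach-v1", the module name agent, and the zero-filled inputs are assumptions for illustration only.

import numpy as np
from agent import Agent  # assumes this file is saved as agent.py

MAX_STEPS = 100
BATCH_SIZE = 32

# "FetchReach-v1" is an assumption; any goal-conditioned env whose
# observation/goal sizes match the hardcoded 10/3 above would do.
agent = Agent("FetchReach-v1", BATCH_SIZE)

obs = np.zeros(10)
goal = np.zeros(3)
# One (state, action) pair per step, zero-padded to MAX_STEPS rows.
history = np.zeros((MAX_STEPS, 10 + agent._dim_action))

action = agent.evaluate_actor(agent._actor.predict, obs, goal, history)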
@@ -0,0 +1,126 @@
import tensorflow as tf
import tflearn

UNITS = 128
MAX_STEPS = 100


class Critic:
    def __init__(self, session, dim_state, dim_goal, dim_action, dim_env, env, tau, learning_rate, num_actor_vars):
        self._sess = session

        self._dim_state = dim_state
        self._dim_action = dim_action
        self._dim_env = dim_env
        self._dim_goal = dim_goal
        self._action_bound = env.action_space.high

        self._learning_rate = learning_rate
        self._tau = tau

        self._net_inputs, self._net_out = self.create_network()

        self._net_input_env, self._net_input_goal, self._net_input_action, self._net_input_state, self._net_input_history = self._net_inputs

        self._network_params = tf.trainable_variables()[num_actor_vars:]

        self._target_inputs, self._target_out = self.create_network()

        self._target_input_env, self._target_input_goal, self._target_input_action, self._target_input_state, self._target_input_history = self._target_inputs

        self._target_network_params = tf.trainable_variables()[(len(self._network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network
        # weights with regularization
        self._update_target_network_params = \
            [self._target_network_params[i].assign(tf.multiply(self._network_params[i], self._tau) \
                + tf.multiply(self._target_network_params[i], 1. - self._tau))
                for i in range(len(self._target_network_params))]

        # Network target (y_i)
        self._predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self._loss = tflearn.mean_square(self._predicted_q_value, self._net_out)
        self._optimize = tf.train.AdamOptimizer(
            self._learning_rate).minimize(self._loss)

        # Get the gradient of the net w.r.t. the action.
        # For each action in the minibatch (i.e., for each x in xs),
        # this will sum up the gradients of each critic output in the minibatch
        # w.r.t. that action. Each output is independent of all
        # actions except for one.
        self._action_grads = tf.gradients(self._net_out, self._net_input_action)

    def create_network(self):
        input_state = tflearn.input_data(shape=[None, self._dim_state])
        input_goal = tflearn.input_data(shape=[None, self._dim_goal])
        input_action = tflearn.input_data(shape=[None, self._dim_action])
        input_env = tflearn.input_data(shape=[None, self._dim_env])

        input_history = tflearn.input_data(shape=[None, MAX_STEPS, self._dim_action + self._dim_state])

        input_ff = tflearn.merge(
            [input_env, input_goal, input_action, input_state], 'concat')

        ff_branch = tflearn.fully_connected(input_ff, UNITS)
        ff_branch = tflearn.activations.relu(ff_branch)

        #recurrent_branch = tflearn.fully_connected(inputs, UNITS)
        #recurrent_branch = tflearn.activations.relu(recurrent_branch)
        recurrent_branch = tflearn.lstm(input_history, UNITS, dynamic=True)

        merged_branch = tflearn.merge([ff_branch, recurrent_branch], 'concat')
        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        merged_branch = tflearn.fully_connected(merged_branch, UNITS)
        merged_branch = tflearn.activations.relu(merged_branch)

        weights_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
        out = tflearn.fully_connected(
            merged_branch, 1, activation='linear', weights_init=weights_init)

        return [input_env, input_goal, input_action, input_state, input_history], out

    def train(self, input_env, input_state, input_goal, input_action, input_history, predicted_q_value):
        return self._sess.run([self._net_out, self._optimize], feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history,
            self._predicted_q_value: predicted_q_value
        })

    def predict(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._net_out, feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history,
        })

    def predict_target(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._target_out, feed_dict={
            self._target_input_env: input_env,
            self._target_input_state: input_state,
            self._target_input_goal: input_goal,
            self._target_input_action: input_action,
            self._target_input_history: input_history,
        })

    def action_gradients(self, input_env, input_state, input_goal, input_action, input_history):
        return self._sess.run(self._action_grads, feed_dict={
            self._net_input_env: input_env,
            self._net_input_state: input_state,
            self._net_input_goal: input_goal,
            self._net_input_action: input_action,
            self._net_input_history: input_history
        })

    def update_target_network(self):
        self._sess.run(self._update_target_network_params)
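Critic.train above consumes predicted_q_value as the TD target y_i, but the training loop that builds it is not part of this commit. The sketch below shows how such a target is typically formed in DDPG-style training; the discount factor, terminal mask, and zero-filled arrays are assumptions for illustration.

import numpy as np

GAMMA = 0.98      # assumed discount factor; not defined anywhere in this commit
BATCH_SIZE = 32

rewards = np.zeros((BATCH_SIZE, 1))        # r_i from a replay buffer (placeholder values)
terminal = np.zeros((BATCH_SIZE, 1))       # 1.0 where the episode ended
target_q_next = np.zeros((BATCH_SIZE, 1))  # e.g. critic.predict_target(...) on next states

# y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})), masked at terminal steps;
# this array is what would be fed to Critic.train as predicted_q_value.
y = rewards + GAMMA * (1.0 - terminal) * target_q_next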
@@ -0,0 +1,24 @@
import gym


class RandomizedEnvironment:
    """ Randomized environment class """
    def __init__(self, experiment, parameter_ranges, goal_range):
        self._experiment = experiment
        self._parameter_ranges = parameter_ranges
        self._goal_range = goal_range
        self._params = [0]

    def sample_env(self):
        self._params = [0]
        self._env = gym.make(self._experiment)
        self._env.env.reward_type = "dense"

    def get_env(self):
        """
        Returns a randomized environment and the vector of the parameter
        space that corresponds to this very instance
        """
        return self._env, self._params

    def get_goal(self):
        return
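A short usage sketch for the class above (the environment id and constructor arguments are placeholders; as written, the class ignores parameter_ranges, always reports params = [0], and get_goal returns None):

from randomized_environment import RandomizedEnvironment  # assumed module name

randomized = RandomizedEnvironment("FetchReach-v1", parameter_ranges=[], goal_range=None)
randomized.sample_env()             # builds a fresh env with dense rewards
env, params = randomized.get_env()  # params is always [0] for now
obs = env.reset()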