import numpy as np
import tensorflow as tf
from utils import conv2d_flipkernel
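
# conv2d_flipkernel (from utils.py) flips the kernel before calling
# tf.nn.conv2d, so the op performs a true convolution (Theano-style)
# rather than TF's cross-correlation.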

def VI_Block(X, S1, S2, config):
    k    = config.k    # Number of value iterations performed
    ch_i = config.ch_i # Channels in input layer
    ch_h = config.ch_h # Channels in initial hidden layer
    ch_q = config.ch_q # Channels in q layer (~actions)
    state_batch_size = config.statebatchsize # k+1 state inputs for each channel

    bias = tf.Variable(np.random.randn(1, 1, 1, ch_h) * 0.01, dtype=tf.float32)
    # weights from inputs to q layer (~reward in Bellman equation)
    w0   = tf.Variable(np.random.randn(3, 3, ch_i, ch_h) * 0.01, dtype=tf.float32)
    w1   = tf.Variable(np.random.randn(1, 1, ch_h, 1) * 0.01, dtype=tf.float32)
    w    = tf.Variable(np.random.randn(3, 3, 1, ch_q) * 0.01, dtype=tf.float32)
    # feedback weights from v layer into q layer (~transition probabilities in Bellman equation)
    w_fb = tf.Variable(np.random.randn(3, 3, 1, ch_q) * 0.01, dtype=tf.float32)
    w_o  = tf.Variable(np.random.randn(ch_q, 8) * 0.01, dtype=tf.float32)

    # initial conv layer over image+reward prior
    h = conv2d_flipkernel(X, w0, name="h0") + bias
    r = conv2d_flipkernel(h, w1, name="r")
    q = conv2d_flipkernel(r, w, name="q")
    v = tf.reduce_max(q, axis=3, keep_dims=True, name="v")
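
    # Each pass of the loop below is one value-iteration step expressed as a
    # convolution (as in the VIN paper): stacking r and v and convolving with
    # the stacked kernels computes, roughly, Q(s,a) ~ R(s) + sum_s' P(s'|s,a) V(s'),
    # and the channel-wise max recovers V(s) = max_a Q(s,a).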
    for i in range(0, k-1):
        rv = tf.concat([r, v], 3)
        wwfb = tf.concat([w, w_fb], 2)
        q = conv2d_flipkernel(rv, wwfb, name="q")
        v = tf.reduce_max(q, axis=3, keep_dims=True, name="v")

    # do one last convolution
    q = conv2d_flipkernel(tf.concat([r, v], 3),
                          tf.concat([w, w_fb], 2), name="q")

    # CHANGE TO THEANO ORDERING
    # Since we are selecting over channels, it becomes easier to work with
    # the tensor when it is in NCHW format vs NHWC
    q = tf.transpose(q, perm=[0, 3, 1, 2])

    # Select the conv-net channels at the state position (S1, S2).
    # This intuitively corresponds to each channel representing an action,
    # and the convnet the Q function. The tricky part is that we want to
    # select the same (S1, S2) position *for each* channel and each sample.
    # TODO: performance can be improved here by substituting expensive
    # transpose calls with better indexing for gather_nd
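    # Illustrative shape trace (hypothetical sizes): if q is [bs, ch_q, m, n]
    # with bs=2 and state_batch_size=10, rprn is [0]*10 + [1]*10, idx_in is a
    # [20, 3] tensor of (S1, S2, batch-index) rows, and gather_nd over the
    # [m, n, bs, ch_q] transpose below yields q_out with shape [20, ch_q].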
    bs = tf.shape(q)[0]
    rprn = tf.reshape(tf.tile(tf.reshape(tf.range(bs), [-1, 1]), [1, state_batch_size]), [-1])
    ins1 = tf.cast(tf.reshape(S1, [-1]), tf.int32)
    ins2 = tf.cast(tf.reshape(S2, [-1]), tf.int32)
    idx_in = tf.transpose(tf.stack([ins1, ins2, rprn]), [1, 0])
    q_out = tf.gather_nd(tf.transpose(q, [2, 3, 0, 1]), idx_in, name="q_out")

    # add logits
    logits = tf.matmul(q_out, w_o)
    # softmax output weights
    output = tf.nn.softmax(logits, name="output")
    return logits, output

# similar to the normal VI_Block except there are separate weights for each q layer
def VI_Untied_Block(X, S1, S2, config):
    k    = config.k    # Number of value iterations performed
    ch_i = config.ch_i # Channels in input layer
    ch_h = config.ch_h # Channels in initial hidden layer
    ch_q = config.ch_q # Channels in q layer (~actions)
    state_batch_size = config.statebatchsize # k+1 state inputs for each channel

    bias = tf.Variable(np.random.randn(1, 1, 1, ch_h) * 0.01, dtype=tf.float32)
    # weights from inputs to q layer (~reward in Bellman equation)
    w0   = tf.Variable(np.random.randn(3, 3, ch_i, ch_h) * 0.01, dtype=tf.float32)
    w1   = tf.Variable(np.random.randn(1, 1, ch_h, 1) * 0.01, dtype=tf.float32)
    w_l  = [tf.Variable(np.random.randn(3, 3, 1, ch_q) * 0.01, dtype=tf.float32)
            for i in range(0, k + 1)]
    # feedback weights from v layer into q layer (~transition probabilities in Bellman equation)
    w_fb_l = [tf.Variable(np.random.randn(3, 3, 1, ch_q) * 0.01, dtype=tf.float32)
              for i in range(0, k)]
    w_o  = tf.Variable(np.random.randn(ch_q, 8) * 0.01, dtype=tf.float32)

    # initial conv layer over image+reward prior
    h = conv2d_flipkernel(X, w0, name="h0") + bias
    r = conv2d_flipkernel(h, w1, name="r")
    q = conv2d_flipkernel(r, w_l[0], name="q")
    v = tf.reduce_max(q, axis=3, keep_dims=True, name="v")
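
    # Untied weight schedule: w_l[0] built the initial q above; the loop
    # consumes w_l[1..k-1] paired with w_fb_l[0..k-2]; the final convolution
    # uses w_l[k] with w_fb_l[k-1] (hence k+1 w's and k feedback kernels).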
    for i in range(0, k-1):
        rv = tf.concat([r, v], 3)
        wwfb = tf.concat([w_l[i+1], w_fb_l[i]], 2)
        q = conv2d_flipkernel(rv, wwfb, name="q")
        v = tf.reduce_max(q, axis=3, keep_dims=True, name="v")

    # do one last convolution
    q = conv2d_flipkernel(tf.concat([r, v], 3),
                          tf.concat([w_l[k], w_fb_l[k-1]], 2), name="q")

    # CHANGE TO THEANO ORDERING
    # Since we are selecting over channels, it becomes easier to work with
    # the tensor when it is in NCHW format vs NHWC
    q = tf.transpose(q, perm=[0, 3, 1, 2])

    # Select the conv-net channels at the state position (S1, S2).
    # This intuitively corresponds to each channel representing an action,
    # and the convnet the Q function. The tricky part is that we want to
    # select the same (S1, S2) position *for each* channel and each sample.
    # TODO: performance can be improved here by substituting expensive
    # transpose calls with better indexing for gather_nd
    bs = tf.shape(q)[0]
    rprn = tf.reshape(tf.tile(tf.reshape(tf.range(bs), [-1, 1]), [1, state_batch_size]), [-1])
    ins1 = tf.cast(tf.reshape(S1, [-1]), tf.int32)
    ins2 = tf.cast(tf.reshape(S2, [-1]), tf.int32)
    idx_in = tf.transpose(tf.stack([ins1, ins2, rprn]), [1, 0])
    q_out = tf.gather_nd(tf.transpose(q, [2, 3, 0, 1]), idx_in, name="q_out")

    # add logits
    logits = tf.matmul(q_out, w_o)
    # softmax output weights
    output = tf.nn.softmax(logits, name="output")
    return logits, output
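
# A minimal smoke-test sketch: builds the VI_Block graph on an 8x8 grid and
# runs one forward pass. The SimpleNamespace below stands in for the argparse
# config a training script would normally supply; all sizes here are
# illustrative assumptions, not values fixed by this file.
if __name__ == "__main__":
    from types import SimpleNamespace

    config = SimpleNamespace(k=10, ch_i=2, ch_h=150, ch_q=10, statebatchsize=10)
    # X stacks the obstacle image and the reward prior along the channel axis
    X  = tf.placeholder(tf.float32, [None, 8, 8, config.ch_i], name="X")
    # S1/S2 hold the row/column coordinates of the query states
    S1 = tf.placeholder(tf.int32, [None, config.statebatchsize], name="S1")
    S2 = tf.placeholder(tf.int32, [None, config.statebatchsize], name="S2")
    logits, output = VI_Block(X, S1, S2, config)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        probs = sess.run(output, {
            X:  np.zeros([1, 8, 8, config.ch_i], dtype=np.float32),
            S1: np.zeros([1, config.statebatchsize], dtype=np.int32),
            S2: np.zeros([1, config.statebatchsize], dtype=np.int32),
        })
        # one 8-way action distribution per query state:
        print(probs.shape)  # (batch * statebatchsize, 8) = (10, 8)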