This repository has been archived by the owner on Jan 1, 2021. It is now read-only.

Fixed data.py preprocessing and model.py for training #66

Open · wants to merge 4 commits into master
58 changes: 29 additions & 29 deletions assignments/chatbot/data.py
@@ -1,7 +1,7 @@
""" A neural chatbot using sequence to sequence model with
attentional decoder.

This is based on Google Translate Tensorflow model
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/

Sequence to sequence model by Cho et al.(2014)
@@ -31,7 +31,7 @@ def get_lines():
with open(file_path, 'rb') as f:
lines = f.readlines()
for line in lines:
-parts = line.split(' +++$+++ ')
+parts = line.split(b' +++$+++ ')
if len(parts) == 5:
if parts[4][-1] == '\n':
parts[4] = parts[4][:-1]
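Since the file is opened in 'rb' mode, every line comes back as bytes, and bytes.split() needs a bytes separator; that is all the b' +++$+++ ' change does. A minimal sketch (the sample line is invented for illustration):

```python
# A made-up line in the Cornell Movie-Dialogs format, as read in 'rb' mode.
line = b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'

parts = line.split(b' +++$+++ ')   # bytes input requires a bytes separator
# line.split(' +++$+++ ')          # TypeError in Python 3: str separator, bytes input

# Caveat the diff leaves untouched: indexing bytes yields an int, so the
# unchanged check parts[4][-1] == '\n' is always False; a bytes-safe test
# would be parts[4][-1:] == b'\n'.
print(parts[4])  # b'They do not!\n'
```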
@@ -44,10 +44,10 @@ def get_convos():
convos = []
with open(file_path, 'rb') as f:
for line in f.readlines():
-parts = line.split(' +++$+++ ')
+parts = line.split(b' +++$+++ ')
if len(parts) == 4:
convo = []
-for line in parts[3][1:-2].split(', '):
+for line in parts[3][1:-2].split(b', '):
convo.append(line[1:-1])
convos.append(convo)
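The conversation field in movie_conversations.txt is a stringified list of line IDs, so the slicing and splitting have to stay in bytes as well. Roughly, with an invented sample field:

```python
# parts[3] for one conversation row, invented for illustration.
field = b"['L194', 'L195', 'L196']\n"

convo = []
for item in field[1:-2].split(b', '):  # strip the surrounding [ ] and trailing \n
    convo.append(item[1:-1])           # strip the quotes around each line ID
print(convo)  # [b'L194', b'L195', b'L196']
```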

@@ -66,22 +66,22 @@ def question_answers(id2line, convos):
def prepare_dataset(questions, answers):
# create path to store all the train & test encoder & decoder
make_dir(config.PROCESSED_PATH)

# random convos to create the test set
test_ids = random.sample([i for i in range(len(questions))],config.TESTSET_SIZE)

filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
files = []
for filename in filenames:
files.append(open(os.path.join(config.PROCESSED_PATH, filename),'wb'))

for i in range(len(questions)):
if i in test_ids:
-files[2].write(questions[i] + '\n')
-files[3].write(answers[i] + '\n')
+files[2].write(questions[i] + b'\n')
+files[3].write(answers[i] + b'\n')
else:
-files[0].write(questions[i] + '\n')
-files[1].write(answers[i] + '\n')
+files[0].write(questions[i] + b'\n')
+files[1].write(answers[i] + b'\n')

for file in files:
file.close()
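The train/test files are opened with 'wb', so everything written to them must already be bytes; appending b'\n' instead of '\n' keeps the concatenation all-bytes. A quick sketch:

```python
question = b'They do not!'            # questions[i] is bytes here
with open('train.enc', 'wb') as f:    # binary mode, as in prepare_dataset
    f.write(question + b'\n')         # OK: bytes + bytes
    # f.write(question + '\n')        # TypeError: can't concat str to bytes
```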
@@ -96,13 +96,13 @@ def make_dir(path):
def basic_tokenizer(line, normalize_digits=True):
""" A basic tokenizer to tokenize text into tokens.
Feel free to change this to suit your need. """
-line = re.sub('<u>', '', line)
-line = re.sub('</u>', '', line)
-line = re.sub('\[', '', line)
-line = re.sub('\]', '', line)
+line = re.sub(b'<u>', b'', line)
+line = re.sub(b'</u>', b'', line)
+line = re.sub(b'\[', b'', line)
+line = re.sub(b'\]', b'', line)
words = []
_WORD_SPLIT = re.compile(b"([.,!?\"'-<>:;)(])")
-_DIGIT_RE = re.compile(r"\d")
+_DIGIT_RE = re.compile(b"\d")
for fragment in line.strip().lower().split():
for token in re.split(_WORD_SPLIT, fragment):
if not token:
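Python 3's re module refuses to mix pattern and subject types: a bytes line needs bytes patterns and bytes replacements. A small sketch (sample text invented):

```python
import re

line = b'<u>They do not! 42</u>'
line = re.sub(b'<u>', b'', line)   # bytes pattern + bytes replacement
line = re.sub(b'</u>', b'', line)

_DIGIT_RE = re.compile(rb'\d')     # the diff's b"\d" also works, with an escape warning
print(_DIGIT_RE.sub(b'#', line))   # b'They do not! ##'
```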
@@ -126,20 +126,20 @@ def build_vocab(filename, normalize_digits=True):

sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
with open(out_path, 'wb') as f:
-f.write('<pad>' + '\n')
-f.write('<unk>' + '\n')
-f.write('<s>' + '\n')
-f.write('<\s>' + '\n')
+f.write(b'<pad>' + b'\n')
+f.write(b'<unk>' + b'\n')
+f.write(b'<s>' + b'\n')
+f.write(b'<\s>' + b'\n')
index = 4
for word in sorted_vocab:
if vocab[word] < config.THRESHOLD:
with open('config.py', 'ab') as cf:
if filename[-3:] == 'enc':
-cf.write('ENC_VOCAB = ' + str(index) + '\n')
+cf.write(b'ENC_VOCAB = ' + str.encode(str(index)) + b'\n')
else:
-cf.write('DEC_VOCAB = ' + str(index) + '\n')
+cf.write(b'DEC_VOCAB = ' + str.encode(str(index)) + b'\n')
break
-f.write(word + '\n')
+f.write(word + b'\n')
index += 1
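str(index) is a str, so it must be encoded before being concatenated with the bytes literals and written to the config file opened in 'ab' mode. For instance, with a hypothetical vocabulary size:

```python
index = 24408  # hypothetical cutoff index, for illustration only
with open('config.py', 'ab') as cf:
    cf.write(b'ENC_VOCAB = ' + str.encode(str(index)) + b'\n')
    # str(index).encode() or ('ENC_VOCAB = %d\n' % index).encode() are equivalent
```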

def load_vocab(vocab_path):
@@ -148,7 +148,7 @@ def load_vocab(vocab_path):
return words, {words[i]: i for i in range(len(words))}

def sentence2id(vocab, line):
-return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)]
+return [vocab.get(token, vocab[b'<unk>']) for token in basic_tokenizer(str.encode(line))]
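Because the vocab file is now written as bytes, load_vocab returns bytes keys, so lookups use b'<unk>' and an incoming str line is encoded before tokenizing. A sketch with a toy vocab (invented; assumes basic_tokenizer from this file is in scope):

```python
# Toy bytes-keyed vocab mirroring what load_vocab would return.
vocab = {b'<pad>': 0, b'<unk>': 1, b'<s>': 2, b'<\s>': 3, b'they': 4, b'do': 5}

def sentence2id(vocab, line):
    return [vocab.get(token, vocab[b'<unk>'])
            for token in basic_tokenizer(str.encode(line))]

print(sentence2id(vocab, 'They do not!'))  # [4, 5, 1, 1]; 'not' and '!' fall back to <unk>
```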

def token2id(data, mode):
""" Convert all the tokens in the data into their corresponding
@@ -160,18 +160,18 @@ def token2id(data, mode):
_, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path))
in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'rb')
out_file = open(os.path.join(config.PROCESSED_PATH, out_path), 'wb')

lines = in_file.read().splitlines()
for line in lines:
if mode == 'dec': # we only care about '<s>' and '<\s>' in the decoder
-ids = [vocab['<s>']]
+ids = [vocab[b'<s>']]
else:
ids = []
ids.extend(sentence2id(vocab, line))
# ids.extend([vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)])
if mode == 'dec':
-ids.append(vocab['<\s>'])
-out_file.write(' '.join(str(id_) for id_ in ids) + '\n')
+ids.append(vocab[b'<\s>'])
+out_file.write(str.encode(' '.join(str(id_) for id_ in ids)) + b'\n')
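' '.join(...) produces a str of space-separated ids, so the whole line is encoded once before being written to the binary-mode output file:

```python
ids = [2, 4, 5, 1, 3]  # hypothetical token ids, including the <s> and <\s> markers
out_line = str.encode(' '.join(str(id_) for id_ in ids)) + b'\n'
print(out_line)  # b'2 4 5 1 3\n'
```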

def prepare_raw_data():
print('Preparing raw data into train set and test set ...')
@@ -253,4 +253,4 @@ def get_batch(data_bucket, bucket_id, batch_size=1):

if __name__ == '__main__':
prepare_raw_data()
process_data()
38 changes: 19 additions & 19 deletions assignments/chatbot/model.py
@@ -1,7 +1,7 @@
""" A neural chatbot using sequence to sequence model with
attentional decoder.

This is based on Google Translate Tensorflow model
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/

Sequence to sequence model by Cho et al.(2014)
@@ -30,7 +30,7 @@ def __init__(self, forward_only, batch_size):
print('Initialize new model')
self.fw_only = forward_only
self.batch_size = batch_size

def _create_placeholders(self):
# Feeds for inputs. It's a list of placeholders
print('Create placeholders')
@@ -43,7 +43,7 @@ def _create_placeholders(self):

# Our targets are decoder inputs shifted by one (to ignore <s> symbol)
self.targets = self.decoder_inputs[1:]

def _inference(self):
print('Create inference')
# If we use sampled softmax, we need an output projection.
@@ -53,9 +53,9 @@ def _inference(self):
b = tf.get_variable('proj_b', [config.DEC_VOCAB])
self.output_projection = (w, b)

-def sampled_loss(inputs, labels):
+def sampled_loss(labels, logits):
labels = tf.reshape(labels, [-1, 1])
-return tf.nn.sampled_softmax_loss(tf.transpose(w), b, inputs, labels,
+return tf.nn.sampled_softmax_loss(tf.transpose(w), b, labels, logits,
config.NUM_SAMPLES, config.DEC_VOCAB)
self.softmax_loss_function = sampled_loss
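The parameter swap tracks two TF 1.x changes: model_with_buckets now calls the loss function as fn(labels=..., logits=...), and tf.nn.sampled_softmax_loss takes (weights, biases, labels, inputs, ...) in that order. The same call with keyword arguments, assuming w, b, and config as above:

```python
def sampled_loss(labels, logits):
    labels = tf.reshape(labels, [-1, 1])
    return tf.nn.sampled_softmax_loss(
        weights=tf.transpose(w),        # [DEC_VOCAB, HIDDEN_SIZE]
        biases=b,                       # [DEC_VOCAB]
        labels=labels,                  # true token ids, shape [batch, 1]
        inputs=logits,                  # pre-projection decoder outputs, despite the name
        num_sampled=config.NUM_SAMPLES,
        num_classes=config.DEC_VOCAB)
```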

@@ -66,7 +66,7 @@ def _create_loss(self):
print('Creating loss... \nIt might take a couple of minutes depending on how many buckets you have.')
start = time.time()
def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
-return tf.nn.seq2seq.embedding_attention_seq2seq(
+return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
encoder_inputs, decoder_inputs, self.cell,
num_encoder_symbols=config.ENC_VOCAB,
num_decoder_symbols=config.DEC_VOCAB,
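The only change in this hunk is the module path: TensorFlow 1.0 moved the legacy seq2seq helpers (embedding_attention_seq2seq here, and model_with_buckets below) from tf.nn.seq2seq into tf.contrib.legacy_seq2seq; the arguments are unchanged.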
@@ -75,24 +75,24 @@ def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
feed_previous=do_decode)

if self.fw_only:
-self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
+self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
self.encoder_inputs,
self.decoder_inputs,
self.targets,
self.decoder_masks,
config.BUCKETS,
lambda x, y: _seq2seq_f(x, y, True),
softmax_loss_function=self.softmax_loss_function)
# If we use output projection, we need to project outputs for decoding.
if self.output_projection:
for bucket in range(len(config.BUCKETS)):
self.outputs[bucket] = [tf.matmul(output,
self.output_projection[0]) + self.output_projection[1]
for output in self.outputs[bucket]]
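With sampled softmax the decoder emits hidden-size vectors, so for forward-only decoding each output has to be projected to vocabulary logits by hand; per output tensor this is just output @ w + b. Hypothetical shapes noted in the comments:

```python
# output: [batch, HIDDEN_SIZE]; w: [HIDDEN_SIZE, DEC_VOCAB]; b: [DEC_VOCAB]
w, b = self.output_projection
logits = tf.matmul(output, w) + b   # [batch, DEC_VOCAB]; argmax gives token ids
```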
else:
-self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
+self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
self.encoder_inputs,
self.decoder_inputs,
self.targets,
self.decoder_masks,
config.BUCKETS,
@@ -112,12 +112,12 @@ def _creat_optimizer(self):
self.train_ops = []
start = time.time()
for bucket in range(len(config.BUCKETS)):
clipped_grads, norm = tf.clip_by_global_norm(tf.gradients(self.losses[bucket],
trainables),
config.MAX_GRAD_NORM)
self.gradient_norms.append(norm)
self.train_ops.append(self.optimizer.apply_gradients(zip(clipped_grads, trainables),
global_step=self.global_step))
print('Creating opt for bucket {} took {} seconds'.format(bucket, time.time() - start))
start = time.time()
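Each bucket gets its own training op: gradients of that bucket's loss are clipped by global norm before being applied. A condensed sketch of the loop body (TF 1.x, names as in the diff):

```python
# Per-bucket train op with global-norm gradient clipping (TF 1.x).
grads = tf.gradients(self.losses[bucket], trainables)
clipped_grads, norm = tf.clip_by_global_norm(grads, config.MAX_GRAD_NORM)
self.gradient_norms.append(norm)
self.train_ops.append(
    self.optimizer.apply_gradients(zip(clipped_grads, trainables),
                                   global_step=self.global_step))
```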