From d7e39102c7e634828b0288fec1f87a1e6631a7b9 Mon Sep 17 00:00:00 2001
From: wwt
Date: Sun, 30 Sep 2018 19:47:22 -0400
Subject: [PATCH 01/65] add differentiable_expected_bleu loss

---
 .../differentiable_expected_bleu/README.md | 40 ++++
 .../config_iwslt14.py | 45 +++++
 .../config_model.py | 29 +++
 .../config_model_full.py | 127 +++++++++++++
 .../config_train.py | 1 +
 .../differentiable_expected_bleu.py | 173 ++++++++++++++++++
 .../prepare_data.py | 53 ++++++
 texar/losses/__init__.py | 1 +
 texar/losses/differentiable_expected_bleu.py | 129 +++++++++++++
 9 files changed, 598 insertions(+)
 create mode 100644 examples/differentiable_expected_bleu/README.md
 create mode 100644 examples/differentiable_expected_bleu/config_iwslt14.py
 create mode 100644 examples/differentiable_expected_bleu/config_model.py
 create mode 100644 examples/differentiable_expected_bleu/config_model_full.py
 create mode 100644 examples/differentiable_expected_bleu/config_train.py
 create mode 100755 examples/differentiable_expected_bleu/differentiable_expected_bleu.py
 create mode 100644 examples/differentiable_expected_bleu/prepare_data.py
 create mode 100644 texar/losses/differentiable_expected_bleu.py

diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md
new file mode 100644
index 00000000..9b481c5e
--- /dev/null
+++ b/examples/differentiable_expected_bleu/README.md
@@ -0,0 +1,40 @@
+# Differentiable Expected BLEU #
+
+This example builds an attentional seq2seq model for machine translation and trains it with a differentiable expected BLEU (DEBLEU) loss.
+
+## Usage ##
+
+### Dataset ###
+
+Two example datasets are provided:
+
+ * toy_copy: A small toy autoencoding dataset from [TF Seq2seq toolkit](https://github.com/google/seq2seq/tree/2500c26add91b079ca00cf1f091db5a99ddab9ae).
+ * iwslt14: The benchmark [IWSLT2014](https://sites.google.com/site/iwsltevaluation2014/home) (de-en) machine translation dataset.
+
+Download the data with the following commands:
+
+```
+python prepare_data.py --data toy_copy
+python prepare_data.py --data iwslt14
+```
+
+### Train the model ###
+
+Train the model with the following command:
+
+```
+python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14
+```
+
+Here:
+ * `--config_model` specifies the model config. Note that the `.py` suffix should not be included.
+ * `--config_data` specifies the data config.
+
+[config_model.py](./config_model.py) specifies a single-layer seq2seq model with Luong attention and a bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file.
+
+For demonstration purposes, [config_model_full.py](./config_model_full.py) gives all possible hyperparameters for the model. The two config files will lead to the same model.
+
+## Results ##
+
+On the IWSLT14 dataset, using the original target texts as references (no `<UNK>` in the references), the model achieves `BLEU=21.66` within `10` epochs.
+ diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py new file mode 100644 index 00000000..0c36dc73 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -0,0 +1,45 @@ + +num_epochs = 15 +display = 500 + +source_vocab_file = './data/iwslt14/vocab.de' +target_vocab_file = './data/iwslt14/vocab.en' + +train = { + 'batch_size': 32, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14/train.de', + 'vocab_file': source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14/train.en', + 'vocab_file': target_vocab_file, + 'max_seq_length': 50 + } +} +val = { + 'batch_size': 32, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14/valid.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14/valid.en', + 'vocab_file': target_vocab_file, + } +} +test = { + 'batch_size': 32, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14/test.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14/test.en', + 'vocab_file': target_vocab_file, + } +} diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py new file mode 100644 index 00000000..8ef3c9b3 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_model.py @@ -0,0 +1,29 @@ +# Attentional Seq2seq model. +# Hyperparameters not specified here will take the default values. + +num_units = 256 +beam_width = 10 + +embedder = { + 'dim': num_units +} +encoder = { + 'rnn_cell_fw': { + 'kwargs': { + 'num_units': num_units + } + } +} +decoder = { + 'rnn_cell': { + 'kwargs': { + 'num_units': num_units + }, + }, + 'attention': { + 'kwargs': { + 'num_units': num_units, + }, + 'attention_layer_size': num_units + } +} diff --git a/examples/differentiable_expected_bleu/config_model_full.py b/examples/differentiable_expected_bleu/config_model_full.py new file mode 100644 index 00000000..b59ebc4e --- /dev/null +++ b/examples/differentiable_expected_bleu/config_model_full.py @@ -0,0 +1,127 @@ +# The full possible hyperparameters for the attentional seq2seq model. +# Most of the hyperparameters take the default values and are not necessary to +# specify explicitly. The config here results in the same model with the +# `config_model.py`. + +num_units = 256 +beam_width = 10 + +# --------------------- Embedder --------------------- # +embedder = { + 'dim': num_units, + 'initializer': { + 'type': 'random_uniform_initializer', + 'kwargs': { + 'minval': -0.1, + 'maxval': 0.1, + 'seed': None + }, + }, + 'regularizer': { + 'type': 'L1L2', + 'kwargs': { + 'l1': 0, + 'l2': 0 + } + }, + 'dropout_rate': 0, + 'dropout_strategy': 'element', + 'trainable': True, + 'name': 'word_embedder' +} + +# --------------------- Encoder --------------------- # +encoder = { + 'rnn_cell_fw': { + 'type': 'LSTMCell', + 'kwargs': { + 'num_units': num_units, + 'forget_bias': 1.0, + 'activation': None, + # Other arguments go here for tf.nn.rnn_cell.LSTMCell + # ... + }, + 'num_layers': 1, + 'dropout': { + 'input_keep_prob': 1.0, + 'output_keep_prob': 1.0, + 'state_keep_prob': 1.0, + 'variational_recurrent': False, + 'input_size': [], + }, + 'residual': False, + 'highway': False, + }, + 'rnn_cell_bw': { + # The same possible hyperparameters as with 'rnn_cell_fw' + # ... 
+ }, + 'rnn_cell_share_config': True, + 'output_layer_fw': { + 'num_layers': 0, + 'layer_size': 128, + 'activation': 'identity', + 'final_layer_activation': None, + 'other_dense_kwargs': None, + 'dropout_layer_ids': [], + 'dropout_rate': 0.5, + 'variational_dropout': False + }, + 'output_layer_bw': { + # The same possible hyperparameters as with 'output_layer_fw' + # ... + }, + 'output_layer_share_config': True, + 'name': 'bidirectional_rnn_encoder' +} + +# --------------------- Decoder --------------------- # +decoder = { + 'rnn_cell': { + 'type': 'LSTMCell', + 'kwargs': { + 'num_units': num_units, + 'forget_bias': 1.0, + 'activation': None, + # Other arguments go here for tf.nn.rnn_cell.LSTMCell + # ... + }, + 'num_layers': 1, + 'dropout': { + 'input_keep_prob': 1.0, + 'output_keep_prob': 1.0, + 'state_keep_prob': 1.0, + 'variational_recurrent': False, + 'input_size': [], + }, + 'residual': False, + 'highway': False, + }, + 'attention': { + 'type': 'LuongAttention', + 'kwargs': { + 'num_units': num_units, + 'scale': False, + 'probability_fn': None, + 'score_mask_value': None, + # Other arguments go here for tf.contrib.seq2seq.LuongAttention + # ... + }, + 'attention_layer_size': num_units, + 'alignment_history': False, + 'output_attention': True, + }, + 'helper_train': { + 'type': 'TrainingHelper', + 'kwargs': { + # Arguments go here for tf.contrib.seq2seq.TrainingHelper + } + }, + 'helper_infer': { + # The same possible hyperparameters as with 'helper_train' + # ... + }, + 'max_decoding_length_train': None, + 'max_decoding_length_infer': None, + 'name': 'attention_rnn_decoder' +} diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py new file mode 100644 index 00000000..e1b30a36 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train.py @@ -0,0 +1 @@ +tau = 1. diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py new file mode 100755 index 00000000..7ba581ec --- /dev/null +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attentional Seq2seq. 
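+
+This example trains the attentional seq2seq model with the differentiable
+expected BLEU (DEBLEU) loss ``tx.losses.differentiable_expected_bleu``: the
+decoder is unrolled with a ``GumbelSoftmaxEmbeddingHelper`` so that soft
+output distributions are fed back as decoder inputs, and inference uses
+beam-search decoding.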
+""" +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +#pylint: disable=invalid-name, too-many-arguments, too-many-locals + +import importlib +import tensorflow as tf +import texar as tx + +flags = tf.flags + +flags.DEFINE_string("config_train", "config_train", "The training config.") +flags.DEFINE_string("config_model", "config_model", "The model config.") +flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") + +FLAGS = flags.FLAGS + +config_train = importlib.import_module(FLAGS.config_train) +config_model = importlib.import_module(FLAGS.config_model) +config_data = importlib.import_module(FLAGS.config_data) + + +def build_model(batch, train_data): + """Assembles the seq2seq model. + """ + source_embedder = tx.modules.WordEmbedder( + vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) + + encoder = tx.modules.BidirectionalRNNEncoder( + hparams=config_model.encoder) + + enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) + + target_embedder = tx.modules.WordEmbedder( + vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) + + decoder = tx.modules.AttentionRNNDecoder( + memory=tf.concat(enc_outputs, axis=2), + memory_sequence_length=batch['source_length'], + vocab_size=train_data.target_vocab.size, + hparams=config_model.decoder) + + start_tokens = tf.ones_like(batch['target_length']) * \ + train_data.target_vocab.bos_token_id + end_token = train_data.target_vocab.eos_token_id + + helper = tx.modules.GumbelSoftmaxEmbeddingHelper( + embedding=target_embedder, + start_tokens=start_tokens, + end_token=end_token, + tau=config_train.tau) + + training_outputs, _, _ = decoder( + helper=helper, + max_decoding_length=50) + + train_op = tx.core.get_train_op( + tx.losses.differentiable_expected_bleu( + #TODO: decide whether to include BOS + labels=batch['target_text_ids'][:, 1:], + logits=training_outputs.logits, + sequence_length=batch['target_length'] - 1)) + + beam_search_outputs, _, _ = \ + tx.modules.beam_search_decode( + decoder_or_cell=decoder, + embedding=target_embedder, + start_tokens=start_tokens, + end_token=end_token, + beam_width=config_model.beam_width, + max_decoding_length=50) + + return train_op, beam_search_outputs + + +def main(): + """Entrypoint. 
+ """ + train_data = tx.data.PairedTextData(hparams=config_data.train) + val_data = tx.data.PairedTextData(hparams=config_data.val) + test_data = tx.data.PairedTextData(hparams=config_data.test) + data_iterator = tx.data.TrainTestDataIterator( + train=train_data, val=val_data, test=test_data) + + batch = data_iterator.get_next() + + train_op, infer_outputs = build_model(batch, train_data) + + def _train_epoch(sess): + data_iterator.switch_to_train_data(sess) + + step = 0 + while True: + try: + loss = sess.run(train_op) + if step % config_data.display == 0: + print("step={}, loss={:.4f}".format(step, loss)) + step += 1 + except tf.errors.OutOfRangeError: + break + + def _eval_epoch(sess, mode): + if mode == 'val': + data_iterator.switch_to_val_data(sess) + else: + data_iterator.switch_to_test_data(sess) + + refs, hypos = [], [] + while True: + try: + fetches = [ + batch['target_text'][:, 1:], + infer_outputs.predicted_ids[:, :, 0] + ] + feed_dict = { + tx.global_mode(): tf.estimator.ModeKeys.EVAL + } + target_texts_ori, output_ids = \ + sess.run(fetches, feed_dict=feed_dict) + + target_texts = tx.utils.strip_special_tokens(target_texts_ori) + output_texts = tx.utils.map_ids_to_strs( + ids=output_ids, vocab=val_data.target_vocab) + + for hypo, ref in zip(output_texts, target_texts): + hypos.append(hypo) + refs.append([ref]) + except tf.errors.OutOfRangeError: + break + + return tx.evals.corpus_bleu_moses(list_of_references=refs, + hypotheses=hypos) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + sess.run(tf.tables_initializer()) + + best_val_bleu = -1. + for i in range(config_data.num_epochs): + _train_epoch(sess) + + val_bleu = _eval_epoch(sess, 'val') + best_val_bleu = max(best_val_bleu, val_bleu) + print('val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( + i, val_bleu, best_val_bleu)) + + test_bleu = _eval_epoch(sess, 'test') + print('test epoch={}, BLEU={:.4f}'.format(i, test_bleu)) + + print('=' * 50) + + +if __name__ == '__main__': + main() + diff --git a/examples/differentiable_expected_bleu/prepare_data.py b/examples/differentiable_expected_bleu/prepare_data.py new file mode 100644 index 00000000..a5cc357b --- /dev/null +++ b/examples/differentiable_expected_bleu/prepare_data.py @@ -0,0 +1,53 @@ +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Downloads data. +""" +import tensorflow as tf +import texar as tx + +# pylint: disable=invalid-name + +flags = tf.flags + +flags.DEFINE_string("data", "iwslt14", "Data to download [iwslt14|toy_copy]") + +FLAGS = flags.FLAGS + +def prepare_data(): + """Downloads data. 
+ """ + if FLAGS.data == 'iwslt14': + tx.data.maybe_download( + urls='https://drive.google.com/file/d/' + '1Vuv3bed10qUxrpldHdYoiWLzPKa4pNXd/view?usp=sharing', + path='./', + filenames='iwslt14.zip', + extract=True) + elif FLAGS.data == 'toy_copy': + tx.data.maybe_download( + urls='https://drive.google.com/file/d/' + '1fENE2rakm8vJ8d3voWBgW4hGlS6-KORW/view?usp=sharing', + path='./', + filenames='toy_copy.zip', + extract=True) + else: + raise ValueError('Unknown data: {}'.format(FLAGS.data)) + +def main(): + """Entrypoint. + """ + prepare_data() + +if __name__ == '__main__': + main() diff --git a/texar/losses/__init__.py b/texar/losses/__init__.py index c684911c..48586d40 100644 --- a/texar/losses/__init__.py +++ b/texar/losses/__init__.py @@ -27,3 +27,4 @@ from texar.losses.adv_losses import * from texar.losses.rewards import * from texar.losses.entropy import * +from texar.losses.differentiable_expected_bleu import * diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/differentiable_expected_bleu.py new file mode 100644 index 00000000..5974e036 --- /dev/null +++ b/texar/losses/differentiable_expected_bleu.py @@ -0,0 +1,129 @@ +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Differentiable Expected BLEU loss +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +# pylint: disable=invalid-name, not-context-manager, protected-access, +# pylint: disable=too-many-arguments + +__all__ = [ + "differentiable_expected_bleu", +] + +def differentiable_expected_bleu(labels, + logits, + sequence_length, + time_major=False, + min_fn=lambda x: tf.minimum(1., x), + max_order=4, + weights=[.1, .3, .3, .3], + smooth_add=1e-9, + name=None): + """Computes sparse softmax cross entropy for each time step of sequence + predictions. + + Args: + labels: Target class indexes. I.e., classes are mutually exclusive + (each entry is in exactly one class). + + - If :attr:`time_major` is `False` (default), this must be\ + a Tensor of shape `[batch_size, max_time]`. + + - If `time_major` is `True`, this must be a Tensor of shape\ + `[max_time, batch_size].` + logits: Unscaled log probabilities. This must have the shape of + `[max_time, batch_size, num_classes]` or + `[batch_size, max_time, num_classes]` according to + the value of `time_major`. + sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond + the respective sequence lengths will have zero losses. + time_major (bool): The shape format of the inputs. If `True`, + :attr:`labels` and :attr:`logits` must have shape + `[max_time, batch_size, ...]`. If `False` + (default), they must have shape `[batch_size, max_time, ...]`. + name (str, optional): A name for the operation. + + Returns: + A Tensor containing the loss of rank 0. + + Example: + + .. 
code-block:: python + + embedder = WordEmbedder(vocab_size=data.vocab.size) + decoder = BasicRNNDecoder(vocab_size=data.vocab.size) + outputs, _, _ = decoder( + decoding_strategy='train_greedy', + inputs=embedder(data_batch['text_ids']), + sequence_length=data_batch['length']-1) + + loss = sequence_sparse_softmax_cross_entropy( + labels=data_batch['text_ids'][:, 1:], + logits=outputs.logits, + sequence_length=data_batch['length']-1) + + """ # TODO: rewrite example + with tf.name_scope(name, "sequence_sparse_softmax_cross_entropy"): + X = logits + Y = labels + + if time_major: + X = tf.transpose(X, [1, 0, 2]) + Y = tf.transpose(Y, [1, 0]) + + sizeX = tf.shape(X)[1] + sizeY = tf.shape(Y)[1] + + XY = tf.batch_gather(X, tf.tile(tf.expand_dims(tf.to_int32(Y), 1), [1, sizeX, 1])) + YY = tf.to_float(tf.equal(tf.expand_dims(Y, 2), tf.expand_dims(Y, 1))) + + maskX = tf.sequence_mask( + sequence_length + 1, maxlen=sizeX + 1, dtype=tf.float32) + maskY = tf.sequence_mask( + sequence_length + 1, maxlen=sizeY + 1, dtype=tf.float32) + matchXY = tf.expand_dims(maskX, 2) * tf.expand_dims(maskY, 1) + matchYY = tf.minimum(tf.expand_dims(maskY, 2), + tf.expand_dims(maskY, 1)) + + tot = [] + o = [] + + for order in range(max_order): + matchXY = XY[:, : sizeX - order, : sizeY - order] * matchXY[:, 1:, 1:] + matchYY = YY[:, : sizeY - order, : sizeY - order] * matchYY[:, 1:, 1:] + cntYX = tf.reduce_sum(matchXY, 1, keepdims=True) + cntYY = tf.reduce_sum(matchYY, 1, keepdims=True) + o_order = tf.reduce_sum(tf.reduce_sum( + min_fn(cntYY / (cntYX - matchXY + 1)) + * matchXY / tf.maximum(1., cntYY), + 2), 1) + # in order to avoid dividing 0 + tot_order = tf.maximum(1, sequence_length - order) + tot.append(tot_order) + o.append(o_order) + + tot = tf.stack(tot, 1) + o = tf.stack(o, 1) + prec = tf.reduce_sum(o, 0) / tf.to_float(tf.reduce_sum(tot, 0)) + neglog_prec = -tf.log(prec + smooth_add) + loss = tf.reduce_sum(weights * neglog_prec, 0) + + return loss From b83a145a688d0614b2e3468814986186d6ee9962 Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 01:10:27 -0400 Subject: [PATCH 02/65] modify DEBLEU loss interface from logits to probs --- texar/losses/differentiable_expected_bleu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/differentiable_expected_bleu.py index 5974e036..521fc903 100644 --- a/texar/losses/differentiable_expected_bleu.py +++ b/texar/losses/differentiable_expected_bleu.py @@ -29,7 +29,7 @@ ] def differentiable_expected_bleu(labels, - logits, + probs, sequence_length, time_major=False, min_fn=lambda x: tf.minimum(1., x), @@ -82,7 +82,7 @@ def differentiable_expected_bleu(labels, """ # TODO: rewrite example with tf.name_scope(name, "sequence_sparse_softmax_cross_entropy"): - X = logits + X = probs Y = labels if time_major: From 87bd449fa65290c96d9bb4f96adfffc3e31c277a Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 01:12:32 -0400 Subject: [PATCH 03/65] add TeacherMaskSoftmaxEmbeddingHelper --- .../differentiable_expected_bleu.py | 12 +- texar/modules/decoders/rnn_decoder_helpers.py | 104 ++++++++++++++++-- 2 files changed, 100 insertions(+), 16 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7ba581ec..ea36d62c 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -61,10 
+61,12 @@ def build_model(batch, train_data): train_data.target_vocab.bos_token_id end_token = train_data.target_vocab.eos_token_id - helper = tx.modules.GumbelSoftmaxEmbeddingHelper( + helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=batch['target_text_ids'], + sequence_length=batch['target_length']-1, embedding=target_embedder, - start_tokens=start_tokens, - end_token=end_token, + n_unmask=1, + n_mask=0, tau=config_train.tau) training_outputs, _, _ = decoder( @@ -75,8 +77,8 @@ def build_model(batch, train_data): tx.losses.differentiable_expected_bleu( #TODO: decide whether to include BOS labels=batch['target_text_ids'][:, 1:], - logits=training_outputs.logits, - sequence_length=batch['target_length'] - 1)) + probs=training_outputs.sample_id, + sequence_length=batch['target_length']-1)) beam_search_outputs, _, _ = \ tx.modules.beam_search_decode( diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 559f3c29..24ec60a4 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -38,6 +38,7 @@ "_get_training_helper", "GumbelSoftmaxEmbeddingHelper", "SoftmaxEmbeddingHelper", + "TeacherMaskSoftmaxEmbeddingHelper", ] def default_helper_train_hparams(): @@ -185,6 +186,17 @@ def _get_training_helper( #pylint: disable=invalid-name return helper +def get_embedding_and_fn(embedding): + if isinstance(embedding, EmbedderBase): + embedding = embedding.embedding + + if callable(embedding): + raise ValueError("`embedding` must be an embedding tensor or an " + "instance of subclass of `EmbedderBase`.") + else: + return embedding, (lambda ids: tf.nn.embedding_lookup(embedding, ids)) + + class SoftmaxEmbeddingHelper(TFHelper): """A helper that feeds softmax probabilities over vocabulary to the next step. 
@@ -215,17 +227,7 @@ class SoftmaxEmbeddingHelper(TFHelper): def __init__(self, embedding, start_tokens, end_token, tau, stop_gradient=False, use_finish=True): - if isinstance(embedding, EmbedderBase): - embedding = embedding.embedding - - if callable(embedding): - raise ValueError("`embedding` must be an embedding tensor or an " - "instance of subclass of `EmbedderBase`.") - else: - self._embedding = embedding - self._embedding_fn = ( - lambda ids: tf.nn.embedding_lookup(embedding, ids)) - + self._embedding, self._embedding_fn = get_embedding_and_fn(embedding) self._start_tokens = tf.convert_to_tensor( start_tokens, dtype=tf.int32, name="start_tokens") self._end_token = tf.convert_to_tensor( @@ -326,3 +328,83 @@ def sample(self, time, outputs, state, name=None): sample_ids = tf.stop_gradient(sample_ids_hard - sample_ids) \ + sample_ids return sample_ids + + +class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): + def __init__(self, inputs, sequence_length, embedding, n_unmask, + n_mask, tau=1., time_major=False, seed=None, + stop_gradient=False): + super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( + inputs=inputs, + sequence_length=sequence_length, + time_major=time_major) + + self._embedding, self._embedding_fn = get_embedding_and_fn(embedding) + self._tau = tau + self._seed = seed + self._stop_gradient = stop_gradient + + self._zero_next_inputs = tf.zeros_like( + self._embedding_fn(self._zero_inputs)) + + self._n_unmask = tf.Variable(n_unmask, name='n_unmask') + self._n_mask = tf.Variable(n_mask, name='n_mask') + self._n_cycle = tf.add(self._n_unmask, self._n_mask, name='n_cycle') + self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) + self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) + self._assign_n_unmask = tf.assign(self._n_unmask, self._new_n_unmask) + self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) + self._n_shift = tf.random_uniform( + [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, + seed=self._seed, name='n_shift') + + @property + def sample_ids_dtype(self): + return tf.float32 + + @property + def sample_ids_shape(self): + return self._embedding.get_shape()[:1] + + def assign_mask_pattern(self, n_unmask, n_mask, sess): + sess.run([self._assign_n_unmask, self._assign_n_mask], + feed_dict={self._new_n_unmask: n_unmask, + self._new_n_mask: n_mask}) + + def _is_masked(self, time): + return time % self._n_cycle < self._n_mask + + def initialize(self, name=None): + finished = tf.equal(0, self._sequence_length) + all_finished = tf.reduce_all(finished) + next_inputs = tf.cond( + all_finished, + lambda: self._zero_next_inputs, + lambda: self._embedding_fn(self._input_tas.read(0))) + return (finished, next_inputs) + + def sample(self, time, outputs, state, name=None): + """Returns `sample_id` of shape `[batch_size, vocab_size]`. 
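+
+        If ``self._is_masked(next_time)`` is true, i.e.
+        ``next_time % (n_unmask + n_mask) < n_mask``, the ground-truth token
+        read from ``inputs`` is returned as a one-hot distribution
+        (a teacher-forced position); otherwise the softmax of the decoder
+        output divided by ``tau`` is returned. For example, with
+        ``n_unmask=2`` and ``n_mask=2`` the pattern repeats with period 4,
+        so two consecutive positions out of every four are teacher-forced.
+        ``next_inputs`` then feeds ``sample_ids`` multiplied by the
+        embedding matrix to the next step.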
+ """ + next_time = time + 1 + sample_ids = tf.cond( + self._is_masked(next_time), + lambda: tf.one_hot(self._input_tas.read(next_time), + self._embedding.get_shape()[0]), + lambda: tf.nn.softmax(outputs / self._tau)) + return sample_ids + + def next_inputs(self, time, outputs, state, sample_ids, name=None): + next_time = time + 1 + finished = (next_time >= self._sequence_length) + all_finished = tf.reduce_all(finished) + if self._stop_gradient: + sample_ids = tf.stop_gradient(sample_ids) + next_inputs = tf.cond( + all_finished, + lambda: self._zero_next_inputs, + lambda: tf.cond( # for efficiency + self._is_masked(next_time), + lambda: self._embedding_fn(self._input_tas.read(next_time)), + lambda: tf.matmul(sample_ids, self._embedding))) + return (finished, next_inputs, state) From a730a254f36e9313b563d131b6c95bfc3b0418ca Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 16:31:33 -0400 Subject: [PATCH 04/65] change API of sess --- texar/modules/decoders/rnn_decoder_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 24ec60a4..1d442c9a 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -366,7 +366,7 @@ def sample_ids_dtype(self): def sample_ids_shape(self): return self._embedding.get_shape()[:1] - def assign_mask_pattern(self, n_unmask, n_mask, sess): + def assign_mask_pattern(self, sess, n_unmask, n_mask): sess.run([self._assign_n_unmask, self._assign_n_mask], feed_dict={self._new_n_unmask: n_unmask, self._new_n_mask: n_mask}) From 86c2f9efd17e2d037684ef80f3c83322abdb2f7b Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 16:32:21 -0400 Subject: [PATCH 05/65] add xe ; refine configs --- .../config_iwslt14.py | 21 ++- .../config_model.py | 6 +- .../config_train.py | 3 + .../differentiable_expected_bleu.py | 146 +++++++++--------- 4 files changed, 91 insertions(+), 85 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py index 0c36dc73..3fbff240 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -1,12 +1,8 @@ - -num_epochs = 15 -display = 500 - -source_vocab_file = './data/iwslt14/vocab.de' -target_vocab_file = './data/iwslt14/vocab.en' +source_vocab_file = 'data/iwslt14/vocab.de' +target_vocab_file = 'data/iwslt14/vocab.en' train = { - 'batch_size': 32, + 'batch_size': 80, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14/train.de', @@ -17,10 +13,11 @@ 'files': 'data/iwslt14/train.en', 'vocab_file': target_vocab_file, 'max_seq_length': 50 - } + }, + 'allow_smaller_final_batch': False, } val = { - 'batch_size': 32, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14/valid.de', @@ -29,10 +26,10 @@ 'target_dataset': { 'files': 'data/iwslt14/valid.en', 'vocab_file': target_vocab_file, - } + }, } test = { - 'batch_size': 32, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14/test.de', @@ -41,5 +38,5 @@ 'target_dataset': { 'files': 'data/iwslt14/test.en', 'vocab_file': target_vocab_file, - } + }, } diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 8ef3c9b3..3ba0c867 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ 
b/examples/differentiable_expected_bleu/config_model.py @@ -1,11 +1,11 @@ # Attentional Seq2seq model. # Hyperparameters not specified here will take the default values. -num_units = 256 -beam_width = 10 +num_units = 1000 +embedding_dim = 500 embedder = { - 'dim': num_units + 'dim': embedding_dim } encoder = { 'rnn_cell_fw': { diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index e1b30a36..f0669550 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1 +1,4 @@ +max_epochs = 1000 tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ea36d62c..37fcb860 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -37,6 +37,13 @@ config_data = importlib.import_module(FLAGS.config_data) +def get_data_loader(sess, fetches, feed_dict): + while True: + try: + yield sess.run(fetches, feed_dict=feed_dict) + except tf.errors.OutOfRangeError: + break + def build_model(batch, train_data): """Assembles the seq2seq model. """ @@ -57,39 +64,51 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - start_tokens = tf.ones_like(batch['target_length']) * \ - train_data.target_vocab.bos_token_id - end_token = train_data.target_vocab.eos_token_id + # cross-entropy + teacher-forcing pretraining + tf_outputs, _, _ = decoder( + decoding_strategy='train_greedy', + inputs=target_embedder(batch['target_text_ids'][:, :-1]), + sequence_length=batch['target_length']-1) - helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - inputs=batch['target_text_ids'], + train_xe_op = tx.core.get_train_op( + tx.losses.sequence_sparse_softmax_cross_entropy( + labels=batch['target_text_ids'][:, 1:], + logits=tf_outputs.logits, + sequence_length=batch['target_length']-1)) + + # teacher mask + DEBLEU fine-tuning + tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=batch['target_text_ids'][:, :-1], sequence_length=batch['target_length']-1, embedding=target_embedder, n_unmask=1, n_mask=0, tau=config_train.tau) - training_outputs, _, _ = decoder( - helper=helper, - max_decoding_length=50) + tm_outputs, _, _ = decoder( + helper=tm_helper) - train_op = tx.core.get_train_op( + train_debleu_op = tx.core.get_train_op( tx.losses.differentiable_expected_bleu( #TODO: decide whether to include BOS labels=batch['target_text_ids'][:, 1:], - probs=training_outputs.sample_id, + probs=tm_outputs.sample_id, sequence_length=batch['target_length']-1)) - beam_search_outputs, _, _ = \ - tx.modules.beam_search_decode( - decoder_or_cell=decoder, - embedding=target_embedder, - start_tokens=start_tokens, - end_token=end_token, - beam_width=config_model.beam_width, - max_decoding_length=50) + # inference: beam search decoding + start_tokens = tf.ones_like(batch['target_length']) * \ + train_data.target_vocab.bos_token_id + end_token = train_data.target_vocab.eos_token_id + + bs_outputs, _, _ = tx.modules.beam_search_decode( + decoder_or_cell=decoder, + embedding=target_embedder, + start_tokens=start_tokens, + end_token=end_token, + beam_width=config_train.infer_beam_width, + max_decoding_length=config_train.infer_max_decoding_length) - return train_op, beam_search_outputs 
+ return train_xe_op, train_debleu_op, bs_outputs def main(): @@ -98,55 +117,47 @@ def main(): train_data = tx.data.PairedTextData(hparams=config_data.train) val_data = tx.data.PairedTextData(hparams=config_data.val) test_data = tx.data.PairedTextData(hparams=config_data.test) - data_iterator = tx.data.TrainTestDataIterator( - train=train_data, val=val_data, test=test_data) + data_iterator = tx.data.FeedableDataIterator( + {'train': train_data, 'val': val_data, 'test': test_data}) - batch = data_iterator.get_next() + data_batch = data_iterator.get_next() - train_op, infer_outputs = build_model(batch, train_data) + train_xe_op, train_debleu_op, infer_outputs = \ + build_model(data_batch, train_data) def _train_epoch(sess): - data_iterator.switch_to_train_data(sess) - - step = 0 - while True: - try: - loss = sess.run(train_op) - if step % config_data.display == 0: - print("step={}, loss={:.4f}".format(step, loss)) - step += 1 - except tf.errors.OutOfRangeError: - break + data_iterator.restart_dataset(sess, 'train') + feed_dict = { + tx.global_mode(): tf.estimator.ModeKeys.TRAIN, + data_iterator.handle: data_iterator.get_handle(sess, 'train') + } - def _eval_epoch(sess, mode): - if mode == 'val': - data_iterator.switch_to_val_data(sess) - else: - data_iterator.switch_to_test_data(sess) - - refs, hypos = [], [] - while True: - try: - fetches = [ - batch['target_text'][:, 1:], - infer_outputs.predicted_ids[:, :, 0] - ] - feed_dict = { - tx.global_mode(): tf.estimator.ModeKeys.EVAL - } - target_texts_ori, output_ids = \ - sess.run(fetches, feed_dict=feed_dict) - - target_texts = tx.utils.strip_special_tokens(target_texts_ori) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - - for hypo, ref in zip(output_texts, target_texts): - hypos.append(hypo) - refs.append([ref]) - except tf.errors.OutOfRangeError: - break + for batch_i, batch in \ + enumerate(get_data_loader(sess, data_batch, feed_dict)): + loss = sess.run(train_xe_op, feed_dict=feed_dict) + def _eval_epoch(sess, mode): + data_iterator.restart_dataset(sess, mode) + feed_dict = { + tx.global_mode(): tf.estimator.ModeKeys.EVAL, + data_iterator.handle: data_iterator.get_handle(sess, mode) + } + + ref_hypo_pairs = [] + fetches = [ + batch['target_text'][:, 1:], + infer_outputs.predicted_ids[:, :, 0] + ] + for target_texts_ori, output_ids in \ + get_data_loader(sess, fetches, feed_dict): + target_texts = tx.utils.strip_special_tokens(target_texts_ori) + output_texts = tx.utils.map_ids_to_strs( + ids=output_ids, vocab=val_data.target_vocab) + + ref_hypo_pairs.extend( + zip(map(lambda x: [x], target_texts), output_texts)) + + refs, hypos = zip(*ref_hypo_pairs) return tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) @@ -155,19 +166,14 @@ def _eval_epoch(sess, mode): sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - best_val_bleu = -1. 
- for i in range(config_data.num_epochs): + epoch = 0 + while epoch < config_train.max_epochs: _train_epoch(sess) + epoch += 1 val_bleu = _eval_epoch(sess, 'val') - best_val_bleu = max(best_val_bleu, val_bleu) - print('val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( - i, val_bleu, best_val_bleu)) test_bleu = _eval_epoch(sess, 'test') - print('test epoch={}, BLEU={:.4f}'.format(i, test_bleu)) - - print('=' * 50) if __name__ == '__main__': From e10c78b75b73fab78840d5aa3a6b447f28ab3020 Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 21:41:55 -0400 Subject: [PATCH 06/65] fix a typo in doc --- texar/core/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texar/core/optimization.py b/texar/core/optimization.py index 1e24e9b3..af48c17d 100644 --- a/texar/core/optimization.py +++ b/texar/core/optimization.py @@ -125,7 +125,7 @@ def default_optimization_hparams(): :tf_main:`tf.clip_by_average_norm `, etc. "type" specifies the gradient clip function, and can be a function, - or its name or mudule path. If function name is provided, the + or its name or module path. If function name is provided, the function must be from module :tf_main:`tf < >` or :mod:`texar.custom`. "kwargs" specifies keyword arguments to the function, except arguments From bdbca3b5ff98d7d2eae51f04c32569923847059a Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 22:47:54 -0400 Subject: [PATCH 07/65] add summary and checkpoints ; add train configs --- .../config_train.py | 25 ++++++++ .../differentiable_expected_bleu.py | 63 ++++++++++++++----- 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index f0669550..2cbe8dd7 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,4 +1,29 @@ max_epochs = 1000 +steps_per_eval = 500 tau = 1. infer_beam_width = 1 infer_max_decoding_length = 50 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ }, + }, +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 37fcb860..ee042be0 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -21,6 +21,7 @@ #pylint: disable=invalid-name, too-many-arguments, too-many-locals import importlib +import os import tensorflow as tf import texar as tx @@ -74,7 +75,8 @@ def build_model(batch, train_data): tx.losses.sequence_sparse_softmax_cross_entropy( labels=batch['target_text_ids'][:, 1:], logits=tf_outputs.logits, - sequence_length=batch['target_length']-1)) + sequence_length=batch['target_length']-1), + hparams=config_train.train_xe) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -93,7 +95,8 @@ def build_model(batch, train_data): #TODO: decide whether to include BOS labels=batch['target_text_ids'][:, 1:], probs=tm_outputs.sample_id, - sequence_length=batch['target_length']-1)) + sequence_length=batch['target_length']-1), + hparams=config_train.train_debleu) # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -122,21 +125,29 @@ def main(): data_batch = data_iterator.get_next() + global_step = tf.train.create_global_step() + train_xe_op, train_debleu_op, infer_outputs = \ build_model(data_batch, train_data) - def _train_epoch(sess): + merged_summary = tf.summary.merge_all() + + saver = tf.train.Saver(max_to_keep=None) + + def _train_epoch(sess, summary_writer): data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, data_iterator.handle: data_iterator.get_handle(sess, 'train') } - for batch_i, batch in \ - enumerate(get_data_loader(sess, data_batch, feed_dict)): - loss = sess.run(train_xe_op, feed_dict=feed_dict) + for loss, summary, step in get_data_loader( + sess, (train_xe_op, merged_summary, global_step), feed_dict): + summary_writer.add_summary(summary, step) + if step % config_train.steps_per_eval == 0: + _eval_epoch(sess, summary_writer, 'val') - def _eval_epoch(sess, mode): + def _eval_epoch(sess, summary_writer, mode): data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -145,7 +156,7 @@ def _eval_epoch(sess, mode): ref_hypo_pairs = [] fetches = [ - batch['target_text'][:, 1:], + data_batch['target_text'][:, 1:], infer_outputs.predicted_ids[:, :, 0] ] for target_texts_ori, output_ids in \ @@ -158,22 +169,42 @@ def _eval_epoch(sess, mode): zip(map(lambda x: [x], target_texts), output_texts)) refs, hypos = zip(*ref_hypo_pairs) - return tx.evals.corpus_bleu_moses(list_of_references=refs, + bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) + step = tf.train.global_step(sess, global_step) + summary = tf.Summary() + summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) + summary_writer.add_summary(summary, step) + return bleu + best_val_bleu = -1 with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) + ckpt_name = 'ckpt/model.ckpt' + if os.path.exists('ckpt') and tf.train.checkpoint_exists(ckpt_name): + print('restoring from {} ...'.format(ckpt_name)) + saver.restore(sess, ckpt_name) + else: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + + summary_writer = 
tf.summary.FileWriter('log', sess.graph) epoch = 0 while epoch < config_train.max_epochs: - _train_epoch(sess) + val_bleu = _eval_epoch(sess, summary_writer, 'val') + if val_bleu > best_val_bleu: + best_val_bleu = val_bleu + print('epoch: {}, step: {}, best val bleu: {}'.format( + epoch, + tf.train.global_step(sess, global_step), + best_val_bleu)) + saved_path = saver.save(sess, 'ckpt/best.ckpt') + print('saved to {}'.format(saved_path)) + _train_epoch(sess, summary_writer) epoch += 1 - - val_bleu = _eval_epoch(sess, 'val') - - test_bleu = _eval_epoch(sess, 'test') + saved_path = saver.save(sess, 'ckpt/model.ckpt') + print('saved to {}'.format(saved_path)) if __name__ == '__main__': From ffbf14d99207c2ff4c4dfbfe95816232c28cbf5b Mon Sep 17 00:00:00 2001 From: wwt Date: Sat, 6 Oct 2018 13:46:35 -0400 Subject: [PATCH 08/65] remove duplicated config --- examples/differentiable_expected_bleu/config_iwslt14.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py index 3fbff240..e6b40e97 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -14,7 +14,6 @@ 'vocab_file': target_vocab_file, 'max_seq_length': 50 }, - 'allow_smaller_final_batch': False, } val = { 'batch_size': 80, From 7e5b92bbfb42b2547d833126c48a353512c38176 Mon Sep 17 00:00:00 2001 From: wwt Date: Sat, 6 Oct 2018 17:34:49 -0400 Subject: [PATCH 09/65] copy tf.batch_gather --- texar/losses/differentiable_expected_bleu.py | 75 +++++++++++++++++++- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/differentiable_expected_bleu.py index 521fc903..1f3cc1f7 100644 --- a/texar/losses/differentiable_expected_bleu.py +++ b/texar/losses/differentiable_expected_bleu.py @@ -28,6 +28,73 @@ "differentiable_expected_bleu", ] +def batch_gather(params, indices, name=None): + """This function is copied and modified from tensorflow 11.0. + Gather slices from `params` according to `indices` with leading batch dims. + This operation assumes that the leading dimensions of `indices` are dense, + and the gathers on the axis corresponding to the last dimension of `indices`. + More concretely it computes: + result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]] + Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM], + `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be + a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`. + In the case in which indices is a 1D tensor, this operation is equivalent to + `tf.gather`. + See also `tf.gather` and `tf.gather_nd`. + Args: + params: A Tensor. The tensor from which to gather values. + indices: A Tensor. Must be one of the following types: int32, int64. Index + tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the + last dimension of `indices` itself. + name: A name for the operation (optional). + Returns: + A Tensor. Has the same type as `params`. + Raises: + ValueError: if `indices` has an unknown shape. 
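+
+    Example:
+      A small illustrative case (the numbers are made up for clarity and
+      are not part of the original TensorFlow docstring). In this module
+      the function is used to pick, for every decoding position, the
+      predicted probabilities of the reference tokens.
+
+      .. code-block:: python
+
+          params = tf.constant([[10, 11, 12],
+                                [20, 21, 22]])   # shape [2, 3]
+          indices = tf.constant([[2, 0],
+                                 [1, 1]])        # shape [2, 2]
+          batch_gather(params, indices)          # -> [[12, 10], [21, 21]]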
+ """ + + with tf.name_scope(name): + indices = tf.convert_to_tensor(indices, name="indices") + params = tf.convert_to_tensor(params, name="params") + indices_shape = tf.shape(indices) + params_shape = tf.shape(params) + + ndims = indices.shape.ndims + if ndims is None: + raise ValueError("batch_gather does not allow indices with unknown " + "shape.") + batch_indices = indices + indices_dtype = indices.dtype.base_dtype + accum_dim_value = tf.ones((), dtype=indices_dtype) + # Use correct type for offset index computation + casted_params_shape = tf.cast(params_shape, indices_dtype) + for dim in range(ndims-1, 0, -1): + dim_value = casted_params_shape[dim-1] + accum_dim_value *= casted_params_shape[dim] + start = tf.zeros((), dtype=indices_dtype) + step = tf.ones((), dtype=indices_dtype) + dim_indices = tf.range(start, dim_value, step) + dim_indices *= accum_dim_value + dim_shape = tf.stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim), + axis=0) + batch_indices += tf.reshape(dim_indices, dim_shape) + + flat_indices = tf.reshape(batch_indices, [-1]) + outer_shape = params_shape[ndims:] + flat_inner_shape = tf.reduce_prod(params_shape[:ndims]) + + flat_params = tf.reshape( + params, tf.concat([[flat_inner_shape], outer_shape], axis=0)) + flat_result = tf.gather(flat_params, flat_indices) + result = tf.reshape( + flat_result, tf.concat([indices_shape, outer_shape], axis=0)) + final_shape = indices.get_shape()[:ndims-1].merge_with( + params.get_shape()[:ndims -1]) + final_shape = final_shape.concatenate(indices.get_shape()[ndims-1]) + final_shape = final_shape.concatenate(params.get_shape()[ndims:]) + result.set_shape(final_shape) + return result + def differentiable_expected_bleu(labels, probs, sequence_length, @@ -92,7 +159,7 @@ def differentiable_expected_bleu(labels, sizeX = tf.shape(X)[1] sizeY = tf.shape(Y)[1] - XY = tf.batch_gather(X, tf.tile(tf.expand_dims(tf.to_int32(Y), 1), [1, sizeX, 1])) + XY = batch_gather(X, tf.tile(tf.expand_dims(Y, 1), [1, sizeX, 1])) YY = tf.to_float(tf.equal(tf.expand_dims(Y, 2), tf.expand_dims(Y, 1))) maskX = tf.sequence_mask( @@ -107,8 +174,10 @@ def differentiable_expected_bleu(labels, o = [] for order in range(max_order): - matchXY = XY[:, : sizeX - order, : sizeY - order] * matchXY[:, 1:, 1:] - matchYY = YY[:, : sizeY - order, : sizeY - order] * matchYY[:, 1:, 1:] + matchXY = XY[:, : sizeX - order, : sizeY - order] * \ + matchXY[:, 1:, 1:] + matchYY = YY[:, : sizeY - order, : sizeY - order] * \ + matchYY[:, 1:, 1:] cntYX = tf.reduce_sum(matchXY, 1, keepdims=True) cntYY = tf.reduce_sum(matchYY, 1, keepdims=True) o_order = tf.reduce_sum(tf.reduce_sum( From d8f7449c2830679139e1cf5d4d5df0f8dd89a327 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 6 Oct 2018 17:41:09 -0400 Subject: [PATCH 10/65] config dataset val=test --- examples/differentiable_expected_bleu/config_iwslt14.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py index 3fbff240..cfcc6d71 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -14,7 +14,6 @@ 'vocab_file': target_vocab_file, 'max_seq_length': 50 }, - 'allow_smaller_final_batch': False, } val = { 'batch_size': 80, @@ -40,3 +39,4 @@ 'vocab_file': target_vocab_file, }, } +val = test From 9bdbe09780f00e9eaecf9cc70cd2f5119dddd85d Mon Sep 17 00:00:00 2001 From: wwt Date: Sun, 7 Oct 2018 01:30:21 -0400 Subject: [PATCH 11/65] 
add triggers ; now the whole code is runnable --- .../config_train.py | 4 + .../differentiable_expected_bleu.py | 48 +++++-- .../differentiable_expected_bleu/triggers.py | 128 ++++++++++++++++++ texar/modules/decoders/rnn_decoder_helpers.py | 5 +- 4 files changed, 170 insertions(+), 15 deletions(-) create mode 100644 examples/differentiable_expected_bleu/triggers.py diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 2cbe8dd7..ac23d115 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -4,6 +4,10 @@ infer_beam_width = 1 infer_max_decoding_length = 50 +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +wait_steps = 10000 + train_xe = { "optimizer": { "type": "AdamOptimizer", diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ee042be0..e1db02a6 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -24,19 +24,23 @@ import os import tensorflow as tf import texar as tx +from triggers import BestEverConvergenceTrigger flags = tf.flags flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") +flags.DEFINE_boolean("pretraining", False, "whether pretraining") FLAGS = flags.FLAGS config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) +pretraining = FLAGS.pretraining +mask_patterns = config_train.mask_patterns def get_data_loader(sess, fetches, feed_dict): while True: @@ -80,11 +84,12 @@ def build_model(batch, train_data): # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - inputs=batch['target_text_ids'][:, :-1], + # must not remove last token, since it may be used as mask + inputs=batch['target_text_ids'], sequence_length=batch['target_length']-1, embedding=target_embedder, - n_unmask=1, - n_mask=0, + n_unmask=mask_patterns[0][0], + n_mask=mask_patterns[0][1], tau=config_train.tau) tm_outputs, _, _ = decoder( @@ -111,7 +116,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe_op, train_debleu_op, bs_outputs + return train_xe_op, train_debleu_op, tm_helper, bs_outputs def main(): @@ -127,14 +132,15 @@ def main(): global_step = tf.train.create_global_step() - train_xe_op, train_debleu_op, infer_outputs = \ + train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) + train_op = train_xe_op if pretraining else train_debleu_op merged_summary = tf.summary.merge_all() saver = tf.train.Saver(max_to_keep=None) - def _train_epoch(sess, summary_writer): + def _train_epoch(sess, summary_writer, train_op, trigger): data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, @@ -142,12 +148,12 @@ def _train_epoch(sess, summary_writer): } for loss, summary, step in get_data_loader( - sess, (train_xe_op, merged_summary, global_step), feed_dict): + sess, (train_op, merged_summary, global_step), feed_dict): 
summary_writer.add_summary(summary, step) if step % config_train.steps_per_eval == 0: - _eval_epoch(sess, summary_writer, 'val') + _eval_epoch(sess, summary_writer, 'val', trigger) - def _eval_epoch(sess, summary_writer, mode): + def _eval_epoch(sess, summary_writer, mode, trigger): data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -171,10 +177,17 @@ def _eval_epoch(sess, summary_writer, mode): refs, hypos = zip(*ref_hypo_pairs) bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) + step = tf.train.global_step(sess, global_step) summary = tf.Summary() summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) summary_writer.add_summary(summary, step) + + if trigger is not None: + triggered, _ = trigger(step, bleu) + if triggered: + print('triggered!') + return bleu best_val_bleu = -1 @@ -190,9 +203,22 @@ def _eval_epoch(sess, summary_writer, mode): summary_writer = tf.summary.FileWriter('log', sess.graph) + if pretraining: + trigger = None + else: + action = map( + lambda pattern: tm_helper.assign_mask_pattern( + sess, pattern[0], pattern[1]), + mask_patterns[1:]) + trigger = BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.wait_steps, + default=None) + epoch = 0 while epoch < config_train.max_epochs: - val_bleu = _eval_epoch(sess, summary_writer, 'val') + val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) if val_bleu > best_val_bleu: best_val_bleu = val_bleu print('epoch: {}, step: {}, best val bleu: {}'.format( @@ -201,7 +227,7 @@ def _eval_epoch(sess, summary_writer, mode): best_val_bleu)) saved_path = saver.save(sess, 'ckpt/best.ckpt') print('saved to {}'.format(saved_path)) - _train_epoch(sess, summary_writer) + _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 saved_path = saver.save(sess, 'ckpt/model.ckpt') print('saved to {}'.format(saved_path)) diff --git a/examples/differentiable_expected_bleu/triggers.py b/examples/differentiable_expected_bleu/triggers.py new file mode 100644 index 00000000..9b879048 --- /dev/null +++ b/examples/differentiable_expected_bleu/triggers.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attentional Seq2seq. +""" +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +#pylint: disable=invalid-name, too-many-arguments, too-many-locals + +try: + import queue +except ImportError: + import Queue as queue + +DEFAULT = object() + +class Trigger(object): + + def __init__(self, action, default=DEFAULT): + """action is an iterator that iteratively do a sequence of action and + return result values. default is used as result value when action is + exhausted. + """ + self._action = action + self._default = default + + def predicate(self, *args, **kwargs): + """This function returns True when we think we should do something. 
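+
+        Subclasses implement the actual convergence / schedule criterion;
+        :meth:`__call__` then advances the ``action`` iterator whenever the
+        predicate fires. A sketch of the intended usage, mirroring the
+        training script in this example (variable names are illustrative
+        only):
+
+        .. code-block:: python
+
+            action = (helper.assign_mask_pattern(sess, n_unmask, n_mask)
+                      for n_unmask, n_mask in mask_patterns[1:])
+            trigger = BestEverConvergenceTrigger(
+                action, threshold_steps, wait_steps, default=None)
+            # after each validation run:
+            triggered, _ = trigger(step, val_bleu)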
+ """ + raise NotImplementedError + + def __call__(self, *args, **kwargs): + pred = self.predicate(*args, **kwargs) + if pred: + ret = next(self._action) if self._default is DEFAULT else \ + next(self._action, self._default) + else: + ret = None + return pred, ret + + +class ScheduledStepsTrigger(Trigger): + + def __init__(self, action, steps, default=DEFAULT): + """steps should be in increasing order. + """ + super(ScheduledTrigger, self).__init__(action, default) + self._steps = iter(steps) + self._advance_steps() + + def _advance_steps(self): + self._next_step = next(step, None) + + def predicate(self, step): + while self._next_step is not None and step < self._next_step: + self._advance_steps() + if self._next_step is not None and step == self._next_step: + return True + return False + + +class BestEverConvergenceTrigger(Trigger): + + def __init__(self, action, threshold_steps, wait_steps, default=DEFAULT): + super(BestEverConvergenceTrigger, self).__init__(action, default) + self._threshold_steps = threshold_steps + self._wait_steps = wait_steps + self._last_triggered_step = None + self._best_ever_step = None + self._best_ever_score = None + + def predicate(self, step, score): + if self._best_ever_score is None or self._best_ever_score < score: + self._best_ever_score = score + self._best_ever_step = step + + if (self._last_triggered_step is None or + step - self._last_triggered_step >= self._wait_steps) and \ + step - self._best_ever_step >= self._threshold_steps: + self._last_triggered_step = step + return True + return False + + +class MovingAverageConvergenceTrigger(Trigger): + + def __init__(self, action, n, threshold, wait_steps, default=DEFAULT): + super(MovingAverageConvergenceTrigger, self).__init__(action, default) + self._n = n + self._threshold = threshold + self._wait_steps = wait_steps + self._last_triggered_step = None + self._head_queue = queue.Queue(self._n) + self._head_sum = 0 + self._rear_queue = queue.Queue(self._n) + self._rear_sum = 0 + + def predicate(self, step, score): + if self._head_queue.full(): + e = self._head_queue.get() + self._head_sum -= e + if self._rear_queue.full(): + self._rear_sum -= self._rear_queue.get() + self._rear_queue.put(e) + self._rear_sum += e + self._head_queue.put(score) + self._head_sum += score + + if (self._last_triggered_step is None or + step - self._last_triggered_step >= self._wait_steps) and \ + self._head_queue.full() and self._rear_queue.full() and \ + self._head_sum - self._rear_sum <= self._n * self._threshold: + self._last_triggered_step = step + return True + return False diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 1d442c9a..a9e2bd1c 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -403,8 +403,5 @@ def next_inputs(self, time, outputs, state, sample_ids, name=None): next_inputs = tf.cond( all_finished, lambda: self._zero_next_inputs, - lambda: tf.cond( # for efficiency - self._is_masked(next_time), - lambda: self._embedding_fn(self._input_tas.read(next_time)), - lambda: tf.matmul(sample_ids, self._embedding))) + lambda: tf.matmul(sample_ids, self._embedding)) return (finished, next_inputs, state) From d04e4e0d222c6d36f0fad660952d5a2a2a8d0979 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 7 Oct 2018 13:27:56 -0400 Subject: [PATCH 12/65] add learning rate --- examples/differentiable_expected_bleu/config_train.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 2cbe8dd7..563156db 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -7,6 +7,9 @@ train_xe = { "optimizer": { "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } }, "gradient_clip": { "type": "clip_by_global_norm", @@ -19,6 +22,9 @@ train_debleu = { "optimizer": { "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } }, "gradient_clip": { "type": "clip_by_global_norm", From 3f74126a05d5e4bcc77c64b6c7b38b4548c3e3b7 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 00:22:34 -0400 Subject: [PATCH 13/65] add mask summary ; fix action --- .../differentiable_expected_bleu.py | 120 +++++++++++------- texar/modules/decoders/rnn_decoder_helpers.py | 8 ++ 2 files changed, 84 insertions(+), 44 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index e1db02a6..8750f059 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -40,8 +40,24 @@ config_data = importlib.import_module(FLAGS.config_data) pretraining = FLAGS.pretraining +expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns +def optimistic_restore(session, save_file, graph=tf.get_default_graph()): + reader = tf.train.NewCheckpointReader(save_file) + saved_shapes = reader.get_variable_to_shape_map() + var_names = sorted([ + (var.name, var.name.split(':')[0]) for var in tf.global_variables() + if var.name.split(':')[0] in saved_shapes]) + restore_vars = [] + for var_name, saved_var_name in var_names: + curr_var = graph.get_tensor_by_name(var_name) + var_shape = curr_var.get_shape().as_list() + if var_shape == saved_shapes[saved_var_name]: + restore_vars.append(curr_var) + opt_saver = tf.train.Saver(restore_vars) + opt_saver.restore(session, save_file) + def get_data_loader(sess, fetches, feed_dict): while True: try: @@ -69,39 +85,48 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - # cross-entropy + teacher-forcing pretraining - tf_outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=target_embedder(batch['target_text_ids'][:, :-1]), - sequence_length=batch['target_length']-1) - - train_xe_op = tx.core.get_train_op( - tx.losses.sequence_sparse_softmax_cross_entropy( - labels=batch['target_text_ids'][:, 1:], - logits=tf_outputs.logits, - sequence_length=batch['target_length']-1), - hparams=config_train.train_xe) - - # teacher mask + DEBLEU fine-tuning - tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - # must not remove last token, since it may be used as mask - inputs=batch['target_text_ids'], - sequence_length=batch['target_length']-1, - embedding=target_embedder, - n_unmask=mask_patterns[0][0], - n_mask=mask_patterns[0][1], - tau=config_train.tau) - - tm_outputs, _, _ = decoder( - helper=tm_helper) - - train_debleu_op = tx.core.get_train_op( - tx.losses.differentiable_expected_bleu( - #TODO: decide whether to include BOS - labels=batch['target_text_ids'][:, 1:], - probs=tm_outputs.sample_id, - sequence_length=batch['target_length']-1), - hparams=config_train.train_debleu) + if pretraining: + # cross-entropy + teacher-forcing pretraining + tf_outputs, _, _ = decoder( + 
decoding_strategy='train_greedy', + inputs=target_embedder(batch['target_text_ids'][:, :-1]), + sequence_length=batch['target_length']-1) + + train_xe_op = tx.core.get_train_op( + tx.losses.sequence_sparse_softmax_cross_entropy( + labels=batch['target_text_ids'][:, 1:], + logits=tf_outputs.logits, + sequence_length=batch['target_length']-1), + hparams=config_train.train_xe) + else: + train_xe_op = None + + if not pretraining: + # teacher mask + DEBLEU fine-tuning + tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + # must not remove last token, since it may be used as mask + inputs=batch['target_text_ids'], + sequence_length=batch['target_length']-1, + embedding=target_embedder, + n_unmask=mask_patterns[0][0], + n_mask=mask_patterns[0][1], + tau=config_train.tau) + tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) + tf.summary.scalar('tm/n_mask', tm_helper.n_mask) + + tm_outputs, _, _ = decoder( + helper=tm_helper) + + train_debleu_op = tx.core.get_train_op( + tx.losses.differentiable_expected_bleu( + #TODO: decide whether to include BOS + labels=batch['target_text_ids'][:, 1:], + probs=tm_outputs.sample_id, + sequence_length=batch['target_length']-1), + hparams=config_train.train_debleu) + else: + tm_helper = None + train_debleu_op = None # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -141,6 +166,7 @@ def main(): saver = tf.train.Saver(max_to_keep=None) def _train_epoch(sess, summary_writer, train_op, trigger): + print('in _train_epoch') data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, @@ -153,7 +179,10 @@ def _train_epoch(sess, summary_writer, train_op, trigger): if step % config_train.steps_per_eval == 0: _eval_epoch(sess, summary_writer, 'val', trigger) + print('end _train_epoch') + def _eval_epoch(sess, summary_writer, mode, trigger): + print('in _eval_epoch with mode {}'.format(mode)) data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -182,34 +211,36 @@ def _eval_epoch(sess, summary_writer, mode, trigger): summary = tf.Summary() summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) summary_writer.add_summary(summary, step) + summary_writer.flush() if trigger is not None: triggered, _ = trigger(step, bleu) if triggered: print('triggered!') + print('end _eval_epoch') return bleu best_val_bleu = -1 with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - ckpt_name = 'ckpt/model.ckpt' - if os.path.exists('ckpt') and tf.train.checkpoint_exists(ckpt_name): + ckpt_path = os.path.join(expr_name, 'ckpt') + ckpt_name = os.path.join(ckpt_path, 'model.ckpt') + if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_name): print('restoring from {} ...'.format(ckpt_name)) - saver.restore(sess, ckpt_name) - else: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) + optimistic_restore(sess, ckpt_name) + print('done.') - summary_writer = tf.summary.FileWriter('log', sess.graph) + summary_writer = tf.summary.FileWriter( + os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) if pretraining: trigger = None else: - action = map( - lambda pattern: tm_helper.assign_mask_pattern( - sess, pattern[0], pattern[1]), - mask_patterns[1:]) + action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) + for n_unmask, n_mask in mask_patterns[1:]) trigger = BestEverConvergenceTrigger( 
action, config_train.threshold_steps, @@ -218,6 +249,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch = 0 while epoch < config_train.max_epochs: + print('epoch #{}:'.format(epoch)) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) if val_bleu > best_val_bleu: best_val_bleu = val_bleu diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index a9e2bd1c..bfe7cb99 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -366,6 +366,14 @@ def sample_ids_dtype(self): def sample_ids_shape(self): return self._embedding.get_shape()[:1] + @property + def n_unmask(self): + return self._n_unmask + + @property + def n_mask(self): + return self._n_mask + def assign_mask_pattern(self, sess, n_unmask, n_mask): sess.run([self._assign_n_unmask, self._assign_n_mask], feed_dict={self._new_n_unmask: n_unmask, From 69129f4bac5d553a657de8156bebba3048d3ce7c Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 14:25:25 -0400 Subject: [PATCH 14/65] fix random shift bug --- examples/differentiable_expected_bleu/config_train.py | 2 ++ texar/modules/decoders/rnn_decoder_helpers.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 890781c1..077f54f9 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -37,3 +37,5 @@ }, }, } + +expr_name = 'xe_1e3_debleu_1e5' diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index bfe7cb99..c12447d7 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -380,7 +380,7 @@ def assign_mask_pattern(self, sess, n_unmask, n_mask): self._new_n_mask: n_mask}) def _is_masked(self, time): - return time % self._n_cycle < self._n_mask + return (time + self._n_shift) % self._n_cycle < self._n_mask def initialize(self, name=None): finished = tf.equal(0, self._sequence_length) From 142665f6137bc2f7b7b6656aa274c371b7512071 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 15:33:33 -0400 Subject: [PATCH 15/65] don't restore Adam status --- .../differentiable_expected_bleu.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 8750f059..f7948cf9 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -55,6 +55,10 @@ def optimistic_restore(session, save_file, graph=tf.get_default_graph()): var_shape = curr_var.get_shape().as_list() if var_shape == saved_shapes[saved_var_name]: restore_vars.append(curr_var) + restore_vars = list(filter( + lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) + print('restoring variables:\n{}'.format('\n'.join( + var.name for var in restore_vars))) opt_saver = tf.train.Saver(restore_vars) opt_saver.restore(session, save_file) @@ -124,6 +128,7 @@ def build_model(batch, train_data): probs=tm_outputs.sample_id, sequence_length=batch['target_length']-1), hparams=config_train.train_debleu) + else: tm_helper = None train_debleu_op = None From c836e538afc71ddf3506384e24728d27faed8187 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: 
Mon, 8 Oct 2018 18:36:50 -0400 Subject: [PATCH 16/65] fix save path --- .../differentiable_expected_bleu.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index f7948cf9..7e3f6fde 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -168,7 +168,7 @@ def main(): merged_summary = tf.summary.merge_all() - saver = tf.train.Saver(max_to_keep=None) + saver = tf.train.Saver(max_to_keep=0) def _train_epoch(sess, summary_writer, train_op, trigger): print('in _train_epoch') @@ -232,10 +232,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) ckpt_path = os.path.join(expr_name, 'ckpt') - ckpt_name = os.path.join(ckpt_path, 'model.ckpt') - if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_name): - print('restoring from {} ...'.format(ckpt_name)) - optimistic_restore(sess, ckpt_name) + ckpt_model = os.path.join(ckpt_path, 'model.ckpt') + ckpt_best = os.path.join(ckpt_path, 'best.ckpt') + if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_model): + print('restoring from {} ...'.format(ckpt_model)) + optimistic_restore(sess, ckpt_model) print('done.') summary_writer = tf.summary.FileWriter( @@ -262,11 +263,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch, tf.train.global_step(sess, global_step), best_val_bleu)) - saved_path = saver.save(sess, 'ckpt/best.ckpt') + saved_path = saver.save(sess, os.path.join(ckpt_best)) print('saved to {}'.format(saved_path)) _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 - saved_path = saver.save(sess, 'ckpt/model.ckpt') + saved_path = saver.save(sess, ckpt_model) print('saved to {}'.format(saved_path)) From ffe568ddbe0ce1527275fe3bfb21ff9794ed116a Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 18:43:55 -0400 Subject: [PATCH 17/65] add flags.restore_adam --- .../differentiable_expected_bleu.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7e3f6fde..a31e5632 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -31,7 +31,8 @@ flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") -flags.DEFINE_boolean("pretraining", False, "whether pretraining") +flags.DEFINE_boolean("pretraining", False, "Whether pretraining.") +flags.DEFINE_boolean("restore_adam", False, "Whether to restore Adam states.") FLAGS = flags.FLAGS @@ -39,6 +40,7 @@ config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) pretraining = FLAGS.pretraining +restore_adam = FLAGS.restore_adam expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns @@ -55,8 +57,9 @@ def optimistic_restore(session, save_file, graph=tf.get_default_graph()): var_shape = curr_var.get_shape().as_list() if var_shape == saved_shapes[saved_var_name]: 
restore_vars.append(curr_var) - restore_vars = list(filter( - lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) + if not restore_adam: + restore_vars = list(filter( + lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) print('restoring variables:\n{}'.format('\n'.join( var.name for var in restore_vars))) opt_saver = tf.train.Saver(restore_vars) From 73d1c7b88b47967c4731627012a7f88b8efbec53 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 19:49:46 -0400 Subject: [PATCH 18/65] add global_step onto saved ckpt --- .../differentiable_expected_bleu.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index a31e5632..4d6c12d6 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -171,7 +171,7 @@ def main(): merged_summary = tf.summary.merge_all() - saver = tf.train.Saver(max_to_keep=0) + saver = tf.train.Saver(max_to_keep=None) def _train_epoch(sess, summary_writer, train_op, trigger): print('in _train_epoch') @@ -262,15 +262,16 @@ def _eval_epoch(sess, summary_writer, mode, trigger): val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) if val_bleu > best_val_bleu: best_val_bleu = val_bleu + step = tf.train.global_step(sess, global_step) print('epoch: {}, step: {}, best val bleu: {}'.format( - epoch, - tf.train.global_step(sess, global_step), - best_val_bleu)) - saved_path = saver.save(sess, os.path.join(ckpt_best)) + epoch, step, best_val_bleu)) + saved_path = saver.save( + sess, os.path.join(ckpt_best), global_step=step) print('saved to {}'.format(saved_path)) _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 - saved_path = saver.save(sess, ckpt_model) + step = tf.train.global_step(sess, global_step) + saved_path = saver.save(sess, ckpt_model, global_step=step) print('saved to {}'.format(saved_path)) From bf92f2ee2d08fa052b05b08a9fae04d1661e8c12 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Wed, 10 Oct 2018 18:15:16 -0400 Subject: [PATCH 19/65] add flags.restore_mask --- ...g_train.py => config_en-fr_xe_1e3_xe_1e5_debleu.py} | 2 +- .../differentiable_expected_bleu.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) rename examples/differentiable_expected_bleu/{config_train.py => config_en-fr_xe_1e3_xe_1e5_debleu.py} (94%) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py similarity index 94% rename from examples/differentiable_expected_bleu/config_train.py rename to examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py index 077f54f9..07acbea8 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py @@ -38,4 +38,4 @@ }, } -expr_name = 'xe_1e3_debleu_1e5' +expr_name = 'en-fr_xe_1e3_xe_1e5_debleu' diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 4d6c12d6..7a3c1162 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -33,6 +33,7 @@ flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") 
flags.DEFINE_boolean("pretraining", False, "Whether pretraining.") flags.DEFINE_boolean("restore_adam", False, "Whether to restore Adam states.") +flags.DEFINE_boolean("restore_mask", False, "Whether to restore mask patterns.") FLAGS = flags.FLAGS @@ -41,6 +42,7 @@ config_data = importlib.import_module(FLAGS.config_data) pretraining = FLAGS.pretraining restore_adam = FLAGS.restore_adam +restore_mask = FLAGS.restore_mask expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns @@ -59,7 +61,13 @@ def optimistic_restore(session, save_file, graph=tf.get_default_graph()): restore_vars.append(curr_var) if not restore_adam: restore_vars = list(filter( - lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) + lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', + restore_vars)) + if not restore_mask: + restore_vars = list(filter( + lambda var: var.name.split(':')[0].split('/')[0] not in \ + ['n_unmask', 'n_mask'], + restore_vars)) print('restoring variables:\n{}'.format('\n'.join( var.name for var in restore_vars))) opt_saver = tf.train.Saver(restore_vars) From 9fe74cb2e80e88d23540d16099fc3e6c6d6bb139 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 14:37:33 -0400 Subject: [PATCH 20/65] remove config_model_full.py ; rename debleu ; rename some arguments ; build entire model together --- .../config_model_full.py | 127 --------------- .../differentiable_expected_bleu.py | 82 +++++----- texar/losses/__init__.py | 2 +- ...ferentiable_expected_bleu.py => debleu.py} | 153 +++++++++--------- 4 files changed, 114 insertions(+), 250 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_model_full.py rename texar/losses/{differentiable_expected_bleu.py => debleu.py} (55%) diff --git a/examples/differentiable_expected_bleu/config_model_full.py b/examples/differentiable_expected_bleu/config_model_full.py deleted file mode 100644 index b59ebc4e..00000000 --- a/examples/differentiable_expected_bleu/config_model_full.py +++ /dev/null @@ -1,127 +0,0 @@ -# The full possible hyperparameters for the attentional seq2seq model. -# Most of the hyperparameters take the default values and are not necessary to -# specify explicitly. The config here results in the same model with the -# `config_model.py`. - -num_units = 256 -beam_width = 10 - -# --------------------- Embedder --------------------- # -embedder = { - 'dim': num_units, - 'initializer': { - 'type': 'random_uniform_initializer', - 'kwargs': { - 'minval': -0.1, - 'maxval': 0.1, - 'seed': None - }, - }, - 'regularizer': { - 'type': 'L1L2', - 'kwargs': { - 'l1': 0, - 'l2': 0 - } - }, - 'dropout_rate': 0, - 'dropout_strategy': 'element', - 'trainable': True, - 'name': 'word_embedder' -} - -# --------------------- Encoder --------------------- # -encoder = { - 'rnn_cell_fw': { - 'type': 'LSTMCell', - 'kwargs': { - 'num_units': num_units, - 'forget_bias': 1.0, - 'activation': None, - # Other arguments go here for tf.nn.rnn_cell.LSTMCell - # ... - }, - 'num_layers': 1, - 'dropout': { - 'input_keep_prob': 1.0, - 'output_keep_prob': 1.0, - 'state_keep_prob': 1.0, - 'variational_recurrent': False, - 'input_size': [], - }, - 'residual': False, - 'highway': False, - }, - 'rnn_cell_bw': { - # The same possible hyperparameters as with 'rnn_cell_fw' - # ... 
- }, - 'rnn_cell_share_config': True, - 'output_layer_fw': { - 'num_layers': 0, - 'layer_size': 128, - 'activation': 'identity', - 'final_layer_activation': None, - 'other_dense_kwargs': None, - 'dropout_layer_ids': [], - 'dropout_rate': 0.5, - 'variational_dropout': False - }, - 'output_layer_bw': { - # The same possible hyperparameters as with 'output_layer_fw' - # ... - }, - 'output_layer_share_config': True, - 'name': 'bidirectional_rnn_encoder' -} - -# --------------------- Decoder --------------------- # -decoder = { - 'rnn_cell': { - 'type': 'LSTMCell', - 'kwargs': { - 'num_units': num_units, - 'forget_bias': 1.0, - 'activation': None, - # Other arguments go here for tf.nn.rnn_cell.LSTMCell - # ... - }, - 'num_layers': 1, - 'dropout': { - 'input_keep_prob': 1.0, - 'output_keep_prob': 1.0, - 'state_keep_prob': 1.0, - 'variational_recurrent': False, - 'input_size': [], - }, - 'residual': False, - 'highway': False, - }, - 'attention': { - 'type': 'LuongAttention', - 'kwargs': { - 'num_units': num_units, - 'scale': False, - 'probability_fn': None, - 'score_mask_value': None, - # Other arguments go here for tf.contrib.seq2seq.LuongAttention - # ... - }, - 'attention_layer_size': num_units, - 'alignment_history': False, - 'output_attention': True, - }, - 'helper_train': { - 'type': 'TrainingHelper', - 'kwargs': { - # Arguments go here for tf.contrib.seq2seq.TrainingHelper - } - }, - 'helper_infer': { - # The same possible hyperparameters as with 'helper_train' - # ... - }, - 'max_decoding_length_train': None, - 'max_decoding_length_infer': None, - 'name': 'attention_rnn_decoder' -} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7a3c1162..b8dcaa8d 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -100,49 +100,42 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - if pretraining: - # cross-entropy + teacher-forcing pretraining - tf_outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=target_embedder(batch['target_text_ids'][:, :-1]), - sequence_length=batch['target_length']-1) - - train_xe_op = tx.core.get_train_op( - tx.losses.sequence_sparse_softmax_cross_entropy( - labels=batch['target_text_ids'][:, 1:], - logits=tf_outputs.logits, - sequence_length=batch['target_length']-1), - hparams=config_train.train_xe) - else: - train_xe_op = None - - if not pretraining: - # teacher mask + DEBLEU fine-tuning - tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - # must not remove last token, since it may be used as mask - inputs=batch['target_text_ids'], - sequence_length=batch['target_length']-1, - embedding=target_embedder, - n_unmask=mask_patterns[0][0], - n_mask=mask_patterns[0][1], - tau=config_train.tau) - tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) - tf.summary.scalar('tm/n_mask', tm_helper.n_mask) - - tm_outputs, _, _ = decoder( - helper=tm_helper) - - train_debleu_op = tx.core.get_train_op( - tx.losses.differentiable_expected_bleu( - #TODO: decide whether to include BOS - labels=batch['target_text_ids'][:, 1:], - probs=tm_outputs.sample_id, - sequence_length=batch['target_length']-1), - hparams=config_train.train_debleu) - - else: - tm_helper = None - train_debleu_op = None + # cross-entropy + teacher-forcing pretraining + tf_outputs, _, _ = decoder( + 
decoding_strategy='train_greedy', + inputs=target_embedder(batch['target_text_ids'][:, :-1]), + sequence_length=batch['target_length']-1) + + loss_xe = tx.losses.sequence_sparse_softmax_cross_entropy( + labels=batch['target_text_ids'][:, 1:], + logits=tf_outputs.logits, + sequence_length=batch['target_length']-1) + + train_xe_op = tx.core.get_train_op( + loss_xe, + hparams=config_train.train_xe) + + # teacher mask + DEBLEU fine-tuning + tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + # must not remove last token, since it may be used as mask + inputs=batch['target_text_ids'], + sequence_length=batch['target_length']-1, + embedding=target_embedder, + n_unmask=mask_patterns[0][0], + n_mask=mask_patterns[0][1], + tau=config_train.tau) + + tm_outputs, _, _ = decoder( + helper=tm_helper) + + loss_debleu = tx.losses.debleu( + labels=batch['target_text_ids'][:, 1:], + probs=tm_outputs.sample_id, + sequence_length=batch['target_length']-1) + + train_debleu_op = tx.core.get_train_op( + loss_debleu, + hparams=config_train.train_debleu) # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -177,6 +170,9 @@ def main(): build_model(data_batch, train_data) train_op = train_xe_op if pretraining else train_debleu_op + tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) + tf.summary.scalar('tm/n_mask', tm_helper.n_mask) + merged_summary = tf.summary.merge_all() saver = tf.train.Saver(max_to_keep=None) diff --git a/texar/losses/__init__.py b/texar/losses/__init__.py index 48586d40..c8d09cfc 100644 --- a/texar/losses/__init__.py +++ b/texar/losses/__init__.py @@ -27,4 +27,4 @@ from texar.losses.adv_losses import * from texar.losses.rewards import * from texar.losses.entropy import * -from texar.losses.differentiable_expected_bleu import * +from texar.losses.debleu import * diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/debleu.py similarity index 55% rename from texar/losses/differentiable_expected_bleu.py rename to texar/losses/debleu.py index 1f3cc1f7..eeb9ba04 100644 --- a/texar/losses/differentiable_expected_bleu.py +++ b/texar/losses/debleu.py @@ -25,85 +25,80 @@ # pylint: disable=too-many-arguments __all__ = [ - "differentiable_expected_bleu", + "debleu", ] def batch_gather(params, indices, name=None): - """This function is copied and modified from tensorflow 11.0. - Gather slices from `params` according to `indices` with leading batch dims. - This operation assumes that the leading dimensions of `indices` are dense, - and the gathers on the axis corresponding to the last dimension of `indices`. - More concretely it computes: - result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]] - Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM], - `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be - a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`. - In the case in which indices is a 1D tensor, this operation is equivalent to - `tf.gather`. - See also `tf.gather` and `tf.gather_nd`. - Args: - params: A Tensor. The tensor from which to gather values. - indices: A Tensor. Must be one of the following types: int32, int64. Index - tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the - last dimension of `indices` itself. - name: A name for the operation (optional). - Returns: - A Tensor. Has the same type as `params`. - Raises: - ValueError: if `indices` has an unknown shape. 
- """ - - with tf.name_scope(name): - indices = tf.convert_to_tensor(indices, name="indices") - params = tf.convert_to_tensor(params, name="params") - indices_shape = tf.shape(indices) - params_shape = tf.shape(params) - - ndims = indices.shape.ndims - if ndims is None: - raise ValueError("batch_gather does not allow indices with unknown " - "shape.") - batch_indices = indices - indices_dtype = indices.dtype.base_dtype - accum_dim_value = tf.ones((), dtype=indices_dtype) - # Use correct type for offset index computation - casted_params_shape = tf.cast(params_shape, indices_dtype) - for dim in range(ndims-1, 0, -1): - dim_value = casted_params_shape[dim-1] - accum_dim_value *= casted_params_shape[dim] - start = tf.zeros((), dtype=indices_dtype) - step = tf.ones((), dtype=indices_dtype) - dim_indices = tf.range(start, dim_value, step) - dim_indices *= accum_dim_value - dim_shape = tf.stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim), - axis=0) - batch_indices += tf.reshape(dim_indices, dim_shape) - - flat_indices = tf.reshape(batch_indices, [-1]) - outer_shape = params_shape[ndims:] - flat_inner_shape = tf.reduce_prod(params_shape[:ndims]) - - flat_params = tf.reshape( - params, tf.concat([[flat_inner_shape], outer_shape], axis=0)) - flat_result = tf.gather(flat_params, flat_indices) - result = tf.reshape( - flat_result, tf.concat([indices_shape, outer_shape], axis=0)) - final_shape = indices.get_shape()[:ndims-1].merge_with( - params.get_shape()[:ndims -1]) - final_shape = final_shape.concatenate(indices.get_shape()[ndims-1]) - final_shape = final_shape.concatenate(params.get_shape()[ndims:]) - result.set_shape(final_shape) - return result - -def differentiable_expected_bleu(labels, - probs, - sequence_length, - time_major=False, - min_fn=lambda x: tf.minimum(1., x), - max_order=4, - weights=[.1, .3, .3, .3], - smooth_add=1e-9, - name=None): + """This function is copied and modified from tensorflow 11.0. See + https://www.tensorflow.org/api_docs/python/tf/batch_gather for details. + Gather slices from `params` according to `indices` with leading batch dims. + This operation assumes that the leading dimensions of `indices` are dense, + and the gathers on the axis corresponding to the last dimension of `indices`. + More concretely it computes: + result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]] + Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM], + `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be + a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`. + In the case in which indices is a 1D tensor, this operation is equivalent to + `tf.gather`. + See also `tf.gather` and `tf.gather_nd`. + Args: + params: A Tensor. The tensor from which to gather values. + indices: A Tensor. Must be one of the following types: int32, int64. Index + tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the + last dimension of `indices` itself. + name: A name for the operation (optional). + Returns: + A Tensor. Has the same type as `params`. + Raises: + ValueError: if `indices` has an unknown shape. 
+ """ + + with tf.name_scope(name): + indices = tf.convert_to_tensor(indices, name="indices") + params = tf.convert_to_tensor(params, name="params") + indices_shape = tf.shape(indices) + params_shape = tf.shape(params) + + ndims = indices.shape.ndims + if ndims is None: + raise ValueError("batch_gather does not allow indices with unknown " + "shape.") + batch_indices = indices + indices_dtype = indices.dtype.base_dtype + accum_dim_value = tf.ones((), dtype=indices_dtype) + # Use correct type for offset index computation + casted_params_shape = tf.cast(params_shape, indices_dtype) + for dim in range(ndims-1, 0, -1): + dim_value = casted_params_shape[dim-1] + accum_dim_value *= casted_params_shape[dim] + start = tf.zeros((), dtype=indices_dtype) + step = tf.ones((), dtype=indices_dtype) + dim_indices = tf.range(start, dim_value, step) + dim_indices *= accum_dim_value + dim_shape = tf.stack( + [1] * (dim - 1) + [dim_value] + [1] * (ndims - dim), axis=0) + batch_indices += tf.reshape(dim_indices, dim_shape) + + flat_indices = tf.reshape(batch_indices, [-1]) + outer_shape = params_shape[ndims:] + flat_inner_shape = tf.reduce_prod(params_shape[:ndims]) + + flat_params = tf.reshape( + params, tf.concat([[flat_inner_shape], outer_shape], axis=0)) + flat_result = tf.gather(flat_params, flat_indices) + result = tf.reshape( + flat_result, tf.concat([indices_shape, outer_shape], axis=0)) + final_shape = indices.get_shape()[:ndims-1].merge_with( + params.get_shape()[:ndims -1]) + final_shape = final_shape.concatenate(indices.get_shape()[ndims-1]) + final_shape = final_shape.concatenate(params.get_shape()[ndims:]) + result.set_shape(final_shape) + return result + +def debleu(labels, probs, sequence_length, time_major=False, + min_fn=lambda x: tf.minimum(1., x), max_order=4, + weights=[.1, .3, .3, .3], epsilon=1e-9, name=None): """Computes sparse softmax cross entropy for each time step of sequence predictions. 
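For reference, the renamed `tx.losses.debleu` keeps the `labels` / `probs` / `sequence_length` interface that the example script already uses. Below is a minimal usage sketch; the dummy tensors, shapes, and sizes are illustrative assumptions only and are not part of the patch.

```
# Minimal sketch of calling the DEBLEU loss (assumed shapes, dummy inputs).
import tensorflow as tf
import texar as tx

batch_size, max_time, vocab_size = 32, 50, 10000

# Soft per-step distributions over the vocabulary, e.g. the sample_id output
# of TeacherMaskSoftmaxEmbeddingHelper in the example.
probs = tf.nn.softmax(tf.random_normal([batch_size, max_time, vocab_size]))
# Gold token ids and their true lengths.
labels = tf.random_uniform(
    [batch_size, max_time], maxval=vocab_size, dtype=tf.int32)
lengths = tf.fill([batch_size], max_time)

loss = tx.losses.debleu(labels=labels, probs=probs, sequence_length=lengths)
train_op = tx.core.get_train_op(loss)  # same pattern as in the example script
```

In the example itself, `probs` comes from the teacher-masked decoder outputs and `labels` are the target ids with the BOS token stripped.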
@@ -148,7 +143,7 @@ def differentiable_expected_bleu(labels, sequence_length=data_batch['length']-1) """ # TODO: rewrite example - with tf.name_scope(name, "sequence_sparse_softmax_cross_entropy"): + with tf.name_scope(name, "debleu"): X = probs Y = labels @@ -184,7 +179,7 @@ def differentiable_expected_bleu(labels, min_fn(cntYY / (cntYX - matchXY + 1)) * matchXY / tf.maximum(1., cntYY), 2), 1) - # in order to avoid dividing 0 + # in order to avoid being divided by 0 tot_order = tf.maximum(1, sequence_length - order) tot.append(tot_order) o.append(o_order) @@ -192,7 +187,7 @@ def differentiable_expected_bleu(labels, tot = tf.stack(tot, 1) o = tf.stack(o, 1) prec = tf.reduce_sum(o, 0) / tf.to_float(tf.reduce_sum(tot, 0)) - neglog_prec = -tf.log(prec + smooth_add) + neglog_prec = -tf.log(prec + epsilon) loss = tf.reduce_sum(weights * neglog_prec, 0) return loss From 9dcde6a97da0266f1e31cf6be3035e68902b0051 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 16:21:00 -0400 Subject: [PATCH 21/65] fix checkpoint save and restore bug --- .../differentiable_expected_bleu.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index b8dcaa8d..a3b071fa 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -238,12 +238,14 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - ckpt_path = os.path.join(expr_name, 'ckpt') - ckpt_model = os.path.join(ckpt_path, 'model.ckpt') - ckpt_best = os.path.join(ckpt_path, 'best.ckpt') - if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_model): - print('restoring from {} ...'.format(ckpt_model)) - optimistic_restore(sess, ckpt_model) + dir_model = os.path.join(expr_name, 'ckpt') + dir_best = os.path.join(expr_name, 'ckpt-best') + ckpt_model = os.path.join(dir_model, 'model.ckpt') + ckpt_best = os.path.join(dir_best, 'model.ckpt') + if os.path.exists(dir_model): + ckpt_path = tf.train.latest_checkpoint(dir_model) + print('restoring from {} ...'.format(ckpt_path)) + optimistic_restore(sess, ckpt_path) print('done.') summary_writer = tf.summary.FileWriter( From 038478e8f0829115581eca44ae00db929a839ace Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 17:25:56 -0400 Subject: [PATCH 22/65] refine trigger --- .../config_train.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train.py diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py new file mode 100644 index 00000000..fe175a22 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train.py @@ -0,0 +1,45 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "piecewise_constant", + "kwargs": { + "boundaries": [160000], + "values": [1e-3, 1e-5], + }, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ }, + }, +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, +} + +expr_name = 'train' From 101d5a194bee1f43ef2844e71311f3f20e6ee14c Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 19:19:37 -0400 Subject: [PATCH 23/65] refine trigger --- .../differentiable_expected_bleu.py | 2 +- .../differentiable_expected_bleu/triggers.py | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index a3b071fa..d5db0664 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -259,7 +259,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): trigger = BestEverConvergenceTrigger( action, config_train.threshold_steps, - config_train.wait_steps, + config_train.minimum_interval_steps, default=None) epoch = 0 diff --git a/examples/differentiable_expected_bleu/triggers.py b/examples/differentiable_expected_bleu/triggers.py index 9b879048..65efe8ca 100644 --- a/examples/differentiable_expected_bleu/triggers.py +++ b/examples/differentiable_expected_bleu/triggers.py @@ -34,16 +34,16 @@ def __init__(self, action, default=DEFAULT): return result values. default is used as result value when action is exhausted. """ - self._action = action + self._action = iter(action) self._default = default - def predicate(self, *args, **kwargs): + def _predicate(self, *args, **kwargs): """This function returns True when we think we should do something. 
""" raise NotImplementedError def __call__(self, *args, **kwargs): - pred = self.predicate(*args, **kwargs) + pred = self._predicate(*args, **kwargs) if pred: ret = next(self._action) if self._default is DEFAULT else \ next(self._action, self._default) @@ -64,7 +64,7 @@ def __init__(self, action, steps, default=DEFAULT): def _advance_steps(self): self._next_step = next(step, None) - def predicate(self, step): + def _predicate(self, step): while self._next_step is not None and step < self._next_step: self._advance_steps() if self._next_step is not None and step == self._next_step: @@ -74,21 +74,23 @@ def predicate(self, step): class BestEverConvergenceTrigger(Trigger): - def __init__(self, action, threshold_steps, wait_steps, default=DEFAULT): + def __init__(self, action, threshold_steps, minimum_interval_steps, + default=DEFAULT): super(BestEverConvergenceTrigger, self).__init__(action, default) self._threshold_steps = threshold_steps - self._wait_steps = wait_steps + self._minimum_interval_steps = minimum_interval_steps self._last_triggered_step = None self._best_ever_step = None self._best_ever_score = None - def predicate(self, step, score): + def _predicate(self, step, score): if self._best_ever_score is None or self._best_ever_score < score: self._best_ever_score = score self._best_ever_step = step if (self._last_triggered_step is None or - step - self._last_triggered_step >= self._wait_steps) and \ + step - self._last_triggered_step >= + self._minimum_interval_steps) and \ step - self._best_ever_step >= self._threshold_steps: self._last_triggered_step = step return True @@ -97,18 +99,19 @@ def predicate(self, step, score): class MovingAverageConvergenceTrigger(Trigger): - def __init__(self, action, n, threshold, wait_steps, default=DEFAULT): + def __init__(self, action, n, threshold, minimum_interval_steps, + default=DEFAULT): super(MovingAverageConvergenceTrigger, self).__init__(action, default) self._n = n self._threshold = threshold - self._wait_steps = wait_steps + self._minimum_interval_steps = minimum_interval_steps self._last_triggered_step = None self._head_queue = queue.Queue(self._n) self._head_sum = 0 self._rear_queue = queue.Queue(self._n) self._rear_sum = 0 - def predicate(self, step, score): + def _predicate(self, step, score): if self._head_queue.full(): e = self._head_queue.get() self._head_sum -= e @@ -120,7 +123,8 @@ def predicate(self, step, score): self._head_sum += score if (self._last_triggered_step is None or - step - self._last_triggered_step >= self._wait_steps) and \ + step - self._last_triggered_step + >= self._minimum_interval_steps) and \ self._head_queue.full() and self._rear_queue.full() and \ self._head_sum - self._rear_sum <= self._n * self._threshold: self._last_triggered_step = step From b293bc17d5518d88f7f18aa1f0f3480861ecfcf1 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 21:23:09 -0400 Subject: [PATCH 24/65] add trigger save & restore (not tested yet) --- .../differentiable_expected_bleu.py | 47 +++++++++++++----- .../differentiable_expected_bleu/triggers.py | 48 +++++++++++++++++-- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index d5db0664..10b86d22 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -235,33 +235,44 @@ def _eval_epoch(sess, summary_writer, 
mode, trigger): best_val_bleu = -1 with tf.Session() as sess: + if pretraining: + trigger = None + else: + action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) + for n_unmask, n_mask in mask_patterns[1:]) + trigger = BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.minimum_interval_steps, + default=None) + sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) + dir_model = os.path.join(expr_name, 'ckpt') dir_best = os.path.join(expr_name, 'ckpt-best') ckpt_model = os.path.join(dir_model, 'model.ckpt') ckpt_best = os.path.join(dir_best, 'model.ckpt') + if os.path.exists(dir_model): ckpt_path = tf.train.latest_checkpoint(dir_model) print('restoring from {} ...'.format(ckpt_path)) optimistic_restore(sess, ckpt_path) + + if trigger is not None: + trigger_path = '{}.trigger'.format(ckpt_path) + if os.path.exists(trigger_path): + with open(trigger_path, 'r') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous trigger state.') + print('done.') summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) - if pretraining: - trigger = None - else: - action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) - for n_unmask, n_mask in mask_patterns[1:]) - trigger = BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) - epoch = 0 while epoch < config_train.max_epochs: print('epoch #{}:'.format(epoch)) @@ -272,12 +283,24 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('epoch: {}, step: {}, best val bleu: {}'.format( epoch, step, best_val_bleu)) saved_path = saver.save( - sess, os.path.join(ckpt_best), global_step=step) + sess, ckpt_best, global_step=step) + + if trigger is not None: + with open('{}.trigger'.format(ckpt_best), 'w') as \ + pickle_file: + trigger.save_to_pickle(pickle_file) + print('saved to {}'.format(saved_path)) + _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) + + if trigger is not None: + with open('{}.trigger'.format(ckpt_model), 'w') as pickle_file: + trigger.save_to_pickle(pickle_file) + print('saved to {}'.format(saved_path)) diff --git a/examples/differentiable_expected_bleu/triggers.py b/examples/differentiable_expected_bleu/triggers.py index 65efe8ca..63aad56c 100644 --- a/examples/differentiable_expected_bleu/triggers.py +++ b/examples/differentiable_expected_bleu/triggers.py @@ -18,13 +18,15 @@ from __future__ import print_function from __future__ import division -#pylint: disable=invalid-name, too-many-arguments, too-many-locals +import pickle try: import queue except ImportError: import Queue as queue +#pylint: disable=invalid-name, too-many-arguments, too-many-locals + DEFAULT = object() class Trigger(object): @@ -36,21 +38,50 @@ def __init__(self, action, default=DEFAULT): """ self._action = iter(action) self._default = default + self._triggered_times = 0 def _predicate(self, *args, **kwargs): """This function returns True when we think we should do something. 
""" raise NotImplementedError + def _next_action(self): + return next(self._action) if self._default is DEFAULT else \ + next(self._action, self._default) + def __call__(self, *args, **kwargs): pred = self._predicate(*args, **kwargs) if pred: - ret = next(self._action) if self._default is DEFAULT else \ - next(self._action, self._default) + ret = self._next_action() + self._triggered_times += 1 else: ret = None return pred, ret + def _make_state(self, names): + return {name: getattr(self, name) for name in names} + + @property + def _state_names(self): + return ['_triggered_times'] + + @property + def state(self): + return self._make_state(self._state_names) + + def restore_from_state(self, state): + for name, value in state.items(): + setattr(self, name, value) + + for t in range(self._triggered_times): + self._next_action() + + def save_to_pickle(self, file): + pickle.dump(self.state, file) + + def restore_from_pickle(self, file): + self.restore_from_state(pickle.load(file)) + class ScheduledStepsTrigger(Trigger): @@ -96,6 +127,11 @@ def _predicate(self, step, score): return True return False + @property + def _state_names(self): + return super(BestEverConvergenceTrigger, self)._state_names + [ + '_last_triggered_step', '_best_ever_step', '_best_ever_score'] + class MovingAverageConvergenceTrigger(Trigger): @@ -130,3 +166,9 @@ def _predicate(self, step, score): self._last_triggered_step = step return True return False + + @property + def _state_names(self): + return super(BestEverConvergenceTrigger, self)._state_names + [ + '_last_triggered_step', '_head_queue', '_head_sum', '_rear_queue', + '_rear_sum'] From 9b2b38212cbe5a2ec521dbcfcddefe07d64a0177 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 21:34:37 -0400 Subject: [PATCH 25/65] move module triggers into texar/utils --- .../differentiable_expected_bleu.py | 3 +-- texar/utils/__init__.py | 1 + .../utils}/triggers.py | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) rename {examples/differentiable_expected_bleu => texar/utils}/triggers.py (94%) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 10b86d22..b578d2e7 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -24,7 +24,6 @@ import os import tensorflow as tf import texar as tx -from triggers import BestEverConvergenceTrigger flags = tf.flags @@ -240,7 +239,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): else: action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) for n_unmask, n_mask in mask_patterns[1:]) - trigger = BestEverConvergenceTrigger( + trigger = tx.utils.BestEverConvergenceTrigger( action, config_train.threshold_steps, config_train.minimum_interval_steps, diff --git a/texar/utils/__init__.py b/texar/utils/__init__.py index d22e2050..ab284e9c 100644 --- a/texar/utils/__init__.py +++ b/texar/utils/__init__.py @@ -29,3 +29,4 @@ from texar.utils.mode import * from texar.utils.average_recorder import * from texar.utils.utils_io import * +from texar.utils.triggers import * diff --git a/examples/differentiable_expected_bleu/triggers.py b/texar/utils/triggers.py similarity index 94% rename from examples/differentiable_expected_bleu/triggers.py rename to texar/utils/triggers.py index 63aad56c..efa339e0 100644 --- a/examples/differentiable_expected_bleu/triggers.py +++ b/texar/utils/triggers.py @@ -27,11 
+27,11 @@ #pylint: disable=invalid-name, too-many-arguments, too-many-locals -DEFAULT = object() +DEFAULT_ACTION = object() class Trigger(object): - def __init__(self, action, default=DEFAULT): + def __init__(self, action, default=DEFAULT_ACTION): """action is an iterator that iteratively do a sequence of action and return result values. default is used as result value when action is exhausted. @@ -46,7 +46,7 @@ def _predicate(self, *args, **kwargs): raise NotImplementedError def _next_action(self): - return next(self._action) if self._default is DEFAULT else \ + return next(self._action) if self._default is DEFAULT_ACTION else \ next(self._action, self._default) def __call__(self, *args, **kwargs): @@ -85,7 +85,7 @@ def restore_from_pickle(self, file): class ScheduledStepsTrigger(Trigger): - def __init__(self, action, steps, default=DEFAULT): + def __init__(self, action, steps, default=DEFAULT_ACTION): """steps should be in increasing order. """ super(ScheduledTrigger, self).__init__(action, default) @@ -106,7 +106,7 @@ def _predicate(self, step): class BestEverConvergenceTrigger(Trigger): def __init__(self, action, threshold_steps, minimum_interval_steps, - default=DEFAULT): + default=DEFAULT_ACTION): super(BestEverConvergenceTrigger, self).__init__(action, default) self._threshold_steps = threshold_steps self._minimum_interval_steps = minimum_interval_steps @@ -136,7 +136,7 @@ def _state_names(self): class MovingAverageConvergenceTrigger(Trigger): def __init__(self, action, n, threshold, minimum_interval_steps, - default=DEFAULT): + default=DEFAULT_ACTION): super(MovingAverageConvergenceTrigger, self).__init__(action, default) self._n = n self._threshold = threshold From 190d5b3beb4af581e56d3dc7c60b47ad1403b345 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 22:54:19 -0400 Subject: [PATCH 26/65] refine codes --- .../differentiable_expected_bleu.py | 140 ++++++++---------- texar/utils/triggers.py | 1 + 2 files changed, 65 insertions(+), 76 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index b578d2e7..782846a7 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -30,54 +30,18 @@ flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") -flags.DEFINE_boolean("pretraining", False, "Whether pretraining.") -flags.DEFINE_boolean("restore_adam", False, "Whether to restore Adam states.") -flags.DEFINE_boolean("restore_mask", False, "Whether to restore mask patterns.") +flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") FLAGS = flags.FLAGS config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) -pretraining = FLAGS.pretraining -restore_adam = FLAGS.restore_adam -restore_mask = FLAGS.restore_mask +pretrain_epochs = FLAGS.pretrain_epochs expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns -def optimistic_restore(session, save_file, graph=tf.get_default_graph()): - reader = tf.train.NewCheckpointReader(save_file) - saved_shapes = reader.get_variable_to_shape_map() - var_names = sorted([ - (var.name, var.name.split(':')[0]) 
for var in tf.global_variables() - if var.name.split(':')[0] in saved_shapes]) - restore_vars = [] - for var_name, saved_var_name in var_names: - curr_var = graph.get_tensor_by_name(var_name) - var_shape = curr_var.get_shape().as_list() - if var_shape == saved_shapes[saved_var_name]: - restore_vars.append(curr_var) - if not restore_adam: - restore_vars = list(filter( - lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', - restore_vars)) - if not restore_mask: - restore_vars = list(filter( - lambda var: var.name.split(':')[0].split('/')[0] not in \ - ['n_unmask', 'n_mask'], - restore_vars)) - print('restoring variables:\n{}'.format('\n'.join( - var.name for var in restore_vars))) - opt_saver = tf.train.Saver(restore_vars) - opt_saver.restore(session, save_file) - -def get_data_loader(sess, fetches, feed_dict): - while True: - try: - yield sess.run(fetches, feed_dict=feed_dict) - except tf.errors.OutOfRangeError: - break def build_model(batch, train_data): """Assembles the seq2seq model. @@ -110,6 +74,10 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) + #TODO: find a way to reset Adam state at the lr decay point + #restore_vars = list(filter( + # lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', + # restore_vars)) train_xe_op = tx.core.get_train_op( loss_xe, hparams=config_train.train_xe) @@ -167,7 +135,6 @@ def main(): train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) - train_op = train_xe_op if pretraining else train_debleu_op tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) tf.summary.scalar('tm/n_mask', tm_helper.n_mask) @@ -178,22 +145,31 @@ def main(): def _train_epoch(sess, summary_writer, train_op, trigger): print('in _train_epoch') + data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, data_iterator.handle: data_iterator.get_handle(sess, 'train') } - for loss, summary, step in get_data_loader( - sess, (train_op, merged_summary, global_step), feed_dict): - summary_writer.add_summary(summary, step) - if step % config_train.steps_per_eval == 0: - _eval_epoch(sess, summary_writer, 'val', trigger) + while True: + try: + loss, summary, step = sess.run( + (train_op, merged_summary, global_step), feed_dict) + + summary_writer.add_summary(summary, step) + + if step % config_train.steps_per_eval == 0: + _eval_epoch(sess, summary_writer, 'val', trigger) + + except tf.errors.OutOfRangeError: + break print('end _train_epoch') def _eval_epoch(sess, summary_writer, mode, trigger): print('in _eval_epoch with mode {}'.format(mode)) + data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -205,20 +181,27 @@ def _eval_epoch(sess, summary_writer, mode, trigger): data_batch['target_text'][:, 1:], infer_outputs.predicted_ids[:, :, 0] ] - for target_texts_ori, output_ids in \ - get_data_loader(sess, fetches, feed_dict): - target_texts = tx.utils.strip_special_tokens(target_texts_ori) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - ref_hypo_pairs.extend( - zip(map(lambda x: [x], target_texts), output_texts)) + while True: + try: + target_texts_ori, output_ids = sess.run(fetches, feed_dict) + target_texts = tx.utils.strip_special_tokens(target_texts_ori) + output_texts = tx.utils.map_ids_to_strs( + ids=output_ids, vocab=val_data.target_vocab) + + ref_hypo_pairs.extend( + zip(map(lambda x: [x], target_texts), output_texts)) 
+ + except tf.errors.OutOfRangeError: + break refs, hypos = zip(*ref_hypo_pairs) bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) + print('{} BLEU: {}'.format(mode, bleu)) step = tf.train.global_step(sess, global_step) + summary = tf.Summary() summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) summary_writer.add_summary(summary, step) @@ -234,16 +217,13 @@ def _eval_epoch(sess, summary_writer, mode, trigger): best_val_bleu = -1 with tf.Session() as sess: - if pretraining: - trigger = None - else: - action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) - for n_unmask, n_mask in mask_patterns[1:]) - trigger = tx.utils.BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) + action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) + for n_unmask, n_mask in mask_patterns[1:]) + trigger = tx.utils.BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.minimum_interval_steps, + default=None) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) @@ -257,15 +237,14 @@ def _eval_epoch(sess, summary_writer, mode, trigger): if os.path.exists(dir_model): ckpt_path = tf.train.latest_checkpoint(dir_model) print('restoring from {} ...'.format(ckpt_path)) - optimistic_restore(sess, ckpt_path) + saver.restore(sess, ckpt_path) - if trigger is not None: - trigger_path = '{}.trigger'.format(ckpt_path) - if os.path.exists(trigger_path): - with open(trigger_path, 'r') as pickle_file: - trigger.restore_from_pickle(pickle_file) - else: - print('cannot find previous trigger state.') + trigger_path = '{}.trigger'.format(ckpt_path) + if os.path.exists(trigger_path): + with open(trigger_path, 'r') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous trigger state.') print('done.') @@ -274,29 +253,38 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch = 0 while epoch < config_train.max_epochs: - print('epoch #{}:'.format(epoch)) + pretraining = epoch < pretrain_epochs + print('epoch #{}{}:'.format( + epoch, ' (pretraining)' if pretraining else '')) + val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + step = tf.train.global_step(sess, global_step) + print('epoch: {}, step: {}, val bleu: {}'.format( + epoch, step, val_bleu)) + if val_bleu > best_val_bleu: best_val_bleu = val_bleu - step = tf.train.global_step(sess, global_step) - print('epoch: {}, step: {}, best val bleu: {}'.format( - epoch, step, best_val_bleu)) + print('update best val bleu: {}'.format(best_val_bleu)) + saved_path = saver.save( sess, ckpt_best, global_step=step) - if trigger is not None: + if not pretraining: with open('{}.trigger'.format(ckpt_best), 'w') as \ pickle_file: trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) - _train_epoch(sess, summary_writer, train_op, trigger) + train_op = train_xe_op if pretraining else train_debleu_op + _train_epoch(sess, summary_writer, train_op, + None if pretraining else trigger) epoch += 1 + step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) - if trigger is not None: + if not pretraining: with open('{}.trigger'.format(ckpt_model), 'w') as pickle_file: trigger.save_to_pickle(pickle_file) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index efa339e0..af814029 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -29,6 +29,7 @@ DEFAULT_ACTION = 
object() + class Trigger(object): def __init__(self, action, default=DEFAULT_ACTION): From a4fdd5a7b78a765e4c8f00d66c569d981350f440 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 23:30:31 -0400 Subject: [PATCH 27/65] add comments to debleu.py --- texar/losses/debleu.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/texar/losses/debleu.py b/texar/losses/debleu.py index eeb9ba04..51dffe9c 100644 --- a/texar/losses/debleu.py +++ b/texar/losses/debleu.py @@ -144,23 +144,25 @@ def debleu(labels, probs, sequence_length, time_major=False, """ # TODO: rewrite example with tf.name_scope(name, "debleu"): - X = probs - Y = labels + X = probs # p_theta(y) + Y = labels # y* if time_major: X = tf.transpose(X, [1, 0, 2]) Y = tf.transpose(Y, [1, 0]) - sizeX = tf.shape(X)[1] - sizeY = tf.shape(Y)[1] + T_X = tf.shape(X)[1] # max T + T_Y = tf.shape(Y)[1] # max T* - XY = batch_gather(X, tf.tile(tf.expand_dims(Y, 1), [1, sizeX, 1])) + # XY denotes p(y_i=y*_j) + XY = batch_gather(X, tf.tile(tf.expand_dims(Y, 1), [1, T_X, 1])) + # YY denotes 1(y*_j=y*_j') YY = tf.to_float(tf.equal(tf.expand_dims(Y, 2), tf.expand_dims(Y, 1))) maskX = tf.sequence_mask( - sequence_length + 1, maxlen=sizeX + 1, dtype=tf.float32) + sequence_length + 1, maxlen=T_X + 1, dtype=tf.float32) maskY = tf.sequence_mask( - sequence_length + 1, maxlen=sizeY + 1, dtype=tf.float32) + sequence_length + 1, maxlen=T_Y + 1, dtype=tf.float32) matchXY = tf.expand_dims(maskX, 2) * tf.expand_dims(maskY, 1) matchYY = tf.minimum(tf.expand_dims(maskY, 2), tf.expand_dims(maskY, 1)) @@ -168,26 +170,29 @@ def debleu(labels, probs, sequence_length, time_major=False, tot = [] o = [] - for order in range(max_order): - matchXY = XY[:, : sizeX - order, : sizeY - order] * \ - matchXY[:, 1:, 1:] - matchYY = YY[:, : sizeY - order, : sizeY - order] * \ - matchYY[:, 1:, 1:] + for order in range(max_order): # order = n - 1 + # Eq.20 + matchXY = XY[:, : T_X - order, : T_Y - order] * matchXY[:, 1:, 1:] + matchYY = YY[:, : T_Y - order, : T_Y - order] * matchYY[:, 1:, 1:] cntYX = tf.reduce_sum(matchXY, 1, keepdims=True) cntYY = tf.reduce_sum(matchYY, 1, keepdims=True) + # Eq.14 o_order = tf.reduce_sum(tf.reduce_sum( min_fn(cntYY / (cntYX - matchXY + 1)) * matchXY / tf.maximum(1., cntYY), 2), 1) - # in order to avoid being divided by 0 + # calculate (T - n + 1); max(1, .) 
is to avoid being divided by 0 tot_order = tf.maximum(1, sequence_length - order) tot.append(tot_order) o.append(o_order) tot = tf.stack(tot, 1) o = tf.stack(o, 1) + # Eq.15 prec = tf.reduce_sum(o, 0) / tf.to_float(tf.reduce_sum(tot, 0)) + # add epsilon in order to avoid inf gradient neglog_prec = -tf.log(prec + epsilon) + # Eq.17; constant about BP is omitted loss = tf.reduce_sum(weights * neglog_prec, 0) return loss From 77c0a52bf1988f14ed7e180ea5c9f289fea5e156 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 01:35:09 -0400 Subject: [PATCH 28/65] add name_scope to TeacherMaskSoftmaxEmbeddingHelper --- texar/modules/decoders/rnn_decoder_helpers.py | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index c12447d7..99885440 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -333,30 +333,35 @@ def sample(self, time, outputs, state, name=None): class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): def __init__(self, inputs, sequence_length, embedding, n_unmask, n_mask, tau=1., time_major=False, seed=None, - stop_gradient=False): - super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( - inputs=inputs, - sequence_length=sequence_length, - time_major=time_major) - - self._embedding, self._embedding_fn = get_embedding_and_fn(embedding) - self._tau = tau - self._seed = seed - self._stop_gradient = stop_gradient - - self._zero_next_inputs = tf.zeros_like( - self._embedding_fn(self._zero_inputs)) - - self._n_unmask = tf.Variable(n_unmask, name='n_unmask') - self._n_mask = tf.Variable(n_mask, name='n_mask') - self._n_cycle = tf.add(self._n_unmask, self._n_mask, name='n_cycle') - self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) - self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) - self._assign_n_unmask = tf.assign(self._n_unmask, self._new_n_unmask) - self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) - self._n_shift = tf.random_uniform( - [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, - seed=self._seed, name='n_shift') + stop_gradient=False, name=None): + with tf.name_scope(name, "TeacherMaskSoftmaxEmbeddingHelper", + [embedding, tau, seed, stop_gradient]): + super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( + inputs=inputs, + sequence_length=sequence_length, + time_major=time_major) + + self._embedding, self._embedding_fn = get_embedding_and_fn( + embedding) + self._tau = tau + self._seed = seed + self._stop_gradient = stop_gradient + + self._zero_next_inputs = tf.zeros_like( + self._embedding_fn(self._zero_inputs)) + + self._n_unmask = tf.Variable(n_unmask, name='n_unmask') + self._n_mask = tf.Variable(n_mask, name='n_mask') + self._n_cycle = tf.add( + self._n_unmask, self._n_mask, name='n_cycle') + self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) + self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) + self._assign_n_unmask = tf.assign( + self._n_unmask, self._new_n_unmask) + self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) + self._n_shift = tf.random_uniform( + [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, + seed=self._seed, name='n_shift') @property def sample_ids_dtype(self): From c70b8e256134537cc5fdbf280a9ca28fafa92d93 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 01:44:01 -0400 Subject: [PATCH 29/65] fix lr decay boundaries --- 
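As a quick sanity check on the annotated computation above, the clipped expected n-gram match of Eq.14 can be written out by hand for the unigram case. The following standalone sketch is illustrative only — the toy probabilities, the use of NumPy, and the batch-free shapes are assumptions for exposition, not part of this patch:

```
# Illustrative only: a batch-free, unigram (order n=1) version of Eq.14,
# using the same names as the comments above (XY ~ p(y_i = y*_j)).
import numpy as np

probs = np.array([[0.7, 0.2, 0.1],    # p_theta(y_1) over a 3-word vocabulary
                  [0.1, 0.8, 0.1]])   # p_theta(y_2)
labels = np.array([0, 1])             # reference y* = (0, 1)

XY = probs[:, labels]                  # XY[i, j] = p(y_i = y*_j)
cntYX = XY.sum(axis=0)                 # expected count of y*_j in the output
cntYY = np.array([(labels == t).sum() for t in labels])  # count of y*_j in y*

# Eq.14 with min_fn = min(1., x): expected clipped unigram matches.
o_1 = sum(
    min(1., cntYY[j] / (cntYX[j] - XY[i, j] + 1.)) * XY[i, j] / max(1., cntYY[j])
    for i in range(XY.shape[0]) for j in range(XY.shape[1]))
print(o_1)  # ~1.47 for these probabilities
```

In the deterministic (one-hot) limit this quantity reduces to BLEU-style clipped counts, which is the motivation given in the paper.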
examples/differentiable_expected_bleu/config_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index fe175a22..ccdde330 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -15,7 +15,7 @@ "learning_rate_decay": { "type": "piecewise_constant", "kwargs": { - "boundaries": [160000], + "boundaries": [10000], "values": [1e-3, 1e-5], }, }, From 6daaac8ed89929ec56e15945cd571d9004f9ac68 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 13:32:53 -0400 Subject: [PATCH 30/65] fix save trigger path --- .../differentiable_expected_bleu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 782846a7..86eec55f 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -270,7 +270,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess, ckpt_best, global_step=step) if not pretraining: - with open('{}.trigger'.format(ckpt_best), 'w') as \ + with open('{}.trigger'.format(saved_path), 'w') as \ pickle_file: trigger.save_to_pickle(pickle_file) @@ -285,7 +285,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): saved_path = saver.save(sess, ckpt_model, global_step=step) if not pretraining: - with open('{}.trigger'.format(ckpt_model), 'w') as pickle_file: + with open('{}.trigger'.format(saved_path), 'w') as pickle_file: trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) From afacfe948f9b1999b678355f3fb61efcf27f3a9f Mon Sep 17 00:00:00 2001 From: wwt Date: Sun, 14 Oct 2018 16:48:01 -0400 Subject: [PATCH 31/65] add docs --- docs/code/losses.rst | 8 +++ docs/code/modules.rst | 5 ++ docs/code/utils.rst | 13 ++++ .../differentiable_expected_bleu.py | 4 -- texar/losses/debleu.py | 56 ++++++++++------ texar/modules/decoders/rnn_decoder_helpers.py | 65 ++++++++++++++++++- texar/utils/triggers.py | 32 +++++++++ 7 files changed, 158 insertions(+), 25 deletions(-) diff --git a/docs/code/losses.rst b/docs/code/losses.rst index df1a14a3..87a0c6b0 100644 --- a/docs/code/losses.rst +++ b/docs/code/losses.rst @@ -68,6 +68,14 @@ Entropy .. autofunction:: texar.losses.sequence_entropy_with_logits +DEBLEU +================== + +:hidden:`debleu` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: texar.losses.debleu + + Loss Utils =========== diff --git a/docs/code/modules.rst b/docs/code/modules.rst index d3d6443e..5aac39c0 100644 --- a/docs/code/modules.rst +++ b/docs/code/modules.rst @@ -134,6 +134,11 @@ Decoders .. autoclass:: texar.modules.GumbelSoftmaxEmbeddingHelper :members: +:hidden:`TeacherMaskSoftmaxEmbeddingHelper` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.modules.TeacherMaskSoftmaxEmbeddingHelper + :members: + :hidden:`get_helper` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: texar.modules.get_helper diff --git a/docs/code/utils.rst b/docs/code/utils.rst index 5c113c1a..c463c752 100644 --- a/docs/code/utils.rst +++ b/docs/code/utils.rst @@ -278,3 +278,16 @@ AverageRecorder ========================== .. 
autoclass:: texar.utils.AverageRecorder :members: + +Trigger +========================== + +:hidden:`Trigger` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.utils.Trigger + :members: + +:hidden:`BestEverConvergenceTrigger` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.utils.BestEverConvergenceTrigger + :members: diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 86eec55f..ee12b3dc 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -74,10 +74,6 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - #TODO: find a way to reset Adam state at the lr decay point - #restore_vars = list(filter( - # lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', - # restore_vars)) train_xe_op = tx.core.get_train_op( loss_xe, hparams=config_train.train_xe) diff --git a/texar/losses/debleu.py b/texar/losses/debleu.py index 51dffe9c..1d1307db 100644 --- a/texar/losses/debleu.py +++ b/texar/losses/debleu.py @@ -99,32 +99,42 @@ def batch_gather(params, indices, name=None): def debleu(labels, probs, sequence_length, time_major=False, min_fn=lambda x: tf.minimum(1., x), max_order=4, weights=[.1, .3, .3, .3], epsilon=1e-9, name=None): - """Computes sparse softmax cross entropy for each time step of sequence - predictions. + """Computes Differentiable Expected BLEU (DEBLEU). See + https://openreview.net/pdf?id=S1x2aiRqFX for details. Args: - labels: Target class indexes. I.e., classes are mutually exclusive - (each entry is in exactly one class). + labels: Target sequence token indexes, i.e. y* in the paper. - If :attr:`time_major` is `False` (default), this must be\ - a Tensor of shape `[batch_size, max_time]`. + a tensor of shape `[batch_size, max_time]`. - - If `time_major` is `True`, this must be a Tensor of shape\ + - If `time_major` is `True`, this must be a tensor of shape\ `[max_time, batch_size].` - logits: Unscaled log probabilities. This must have the shape of - `[max_time, batch_size, num_classes]` or - `[batch_size, max_time, num_classes]` according to + probs: Probabilities generated by model, i.e. y in the paper. This must + have the shape of + `[max_time, batch_size, vocab_size]` or + `[batch_size, max_time, vocab_size]` according to the value of `time_major`. - sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond + sequence_length: A tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will have zero losses. time_major (bool): The shape format of the inputs. If `True`, - :attr:`labels` and :attr:`logits` must have shape + :attr:`labels` and :attr:`probs` must have shape `[max_time, batch_size, ...]`. If `False` (default), they must have shape `[batch_size, max_time, ...]`. + min_fn (function, optional): A python function that implements the min + operation in Eq.14 in the paper. Default to tf.minimum(1., x). + max_order (int, optional): Maximum order of grams calculated. Default + to 4. + weights (optional): A tensor (or simply Python list) of shape + `[max_order]` of which the i-th scalar is the weight of (i+1) gram + precision. Default to `[0.1, 0.3, 0.3, 0.3]`. + epsilon (float, optional): A small value added before applying + logarithm in Eq.17 in the paper. This is in order to avoid infinite + gradients. 
Default to 1e-9. name (str, optional): A name for the operation. Returns: - A Tensor containing the loss of rank 0. + A tensor containing the loss of rank 0. Example: @@ -132,17 +142,23 @@ def debleu(labels, probs, sequence_length, time_major=False, embedder = WordEmbedder(vocab_size=data.vocab.size) decoder = BasicRNNDecoder(vocab_size=data.vocab.size) - outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=embedder(data_batch['text_ids']), - sequence_length=data_batch['length']-1) - - loss = sequence_sparse_softmax_cross_entropy( + + tm_helper = texar.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=data_batch['text_ids'], + sequence_length=data_batch['length']-1, + embedding=embedder, + n_unmask=1, + n_mask=0, + tau=1.) + + outputs, _, _ = decoder(helper=tm_helper) + + loss = debleu( labels=data_batch['text_ids'][:, 1:], - logits=outputs.logits, + probs=outputs.sample_ids, sequence_length=data_batch['length']-1) - """ # TODO: rewrite example + """ with tf.name_scope(name, "debleu"): X = probs # p_theta(y) Y = labels # y* diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 99885440..f8e7040e 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -331,6 +331,66 @@ def sample(self, time, outputs, state, name=None): class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): + """A helper that implements the Teacher Mask described in the paper + https://openreview.net/pdf?id=S1x2aiRqFX. In an unmasked step, it feeds + softmax probabilities over vocabulary to the next step. In a masked step, + it feeds the one-hot distribution of the target labels (:attr:`inputs`) + to the next step. + Uses the softmax probability or one-hot vector to pass through word + embeddings to get the next input (i.e., a mixed word embedding). + In this implementation, all sequences in a batch shares the same teacher + mask. + + A subclass of + :tf_main:`TrainingHelper `. + Used as a helper to :class:`~texar.modules.RNNDecoderBase` :meth:`_build` + in training mode. + + Args: + inputs (2D Tensor): Target sequence token indexes. It should be a tensor + of shape `[batch_size, max_time]`. Must append both BOS and EOS + tokens to each sequence. + sequence_length (1D Tensor): Lengths of input token sequences. These + lengths should include the BOS tokens but exclude the EOS tokens. + embedding: An embedding argument (:attr:`params`) for + :tf_main:`tf.nn.embedding_lookup `, or an + instance of subclass of :class:`texar.modules.EmbedderBase`. + Note that other callables are not acceptable here. + n_unmask: An int scalar tensor denotes the mask pattern together with + :attr:`n_mask`. See the paper for details. + n_mask: An int scalar tensor denotes the mask pattern together with + :attr:`n_unmask`. See the paper for details. + tau (float, optional): A float scalar tensor, the softmax temperature. + Default to 1. + seed (int, optional): The random seed used to shift the mask. + stop_gradient (bool): Whether to stop the gradient backpropagation + when feeding softmax vector to the next step. + name (str, optional): A name for the module. + + Example: + + .. code-block:: python + + embedder = WordEmbedder(vocab_size=data.vocab.size) + decoder = BasicRNNDecoder(vocab_size=data.vocab.size) + + tm_helper = texar.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=data_batch['text_ids'], + sequence_length=data_batch['length']-1, + embedding=embedder, + n_unmask=1, + n_mask=0, + tau=1.) 
+ + outputs, _, _ = decoder(helper=tm_helper) + + loss = debleu( + labels=data_batch['text_ids'][:, 1:], + probs=outputs.sample_ids, + sequence_length=data_batch['length']-1) + + """ + def __init__(self, inputs, sequence_length, embedding, n_unmask, n_mask, tau=1., time_major=False, seed=None, stop_gradient=False, name=None): @@ -397,7 +457,10 @@ def initialize(self, name=None): return (finished, next_inputs) def sample(self, time, outputs, state, name=None): - """Returns `sample_id` of shape `[batch_size, vocab_size]`. + """Returns `sample_id` of shape `[batch_size, vocab_size]`. In an + unmasked step, it is softmax distributions over vocabulary with + temperature :attr:`tau`; in a masked step, it is one-hot + representations of :attr:`input` in the next step. """ next_time = time + 1 sample_ids = tf.cond( diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index af814029..e4f8f967 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -27,10 +27,30 @@ #pylint: disable=invalid-name, too-many-arguments, too-many-locals +__all__ = [ + "Trigger", + "BestEverConvergenceTrigger", +] + + DEFAULT_ACTION = object() class Trigger(object): + """A trigger can do some action when certain condition is met. + Specifically, the user calls the trigger periodically. Every time the + trigger is called, it will send all arguments to :meth:`_predicate`, which + returns a boolean value indicates whether the condition is met. Once the + condition is met, the trigger will then call `next(action)` to do next + action and obtain the returned value. + + Args: + action (iterable): An iterable which does the action and possibly + returns a value. + default: The value returned after :attr:`action` stops iteration. If + not provided, the trigger will do nothing when StopIteration + occurs. + """ def __init__(self, action, default=DEFAULT_ACTION): """action is an iterator that iteratively do a sequence of action and @@ -68,9 +88,21 @@ def _state_names(self): @property def state(self): + """The current state which can be used to save and restore the trigger. + The state records how many times `next(action)` has been called. + """ return self._make_state(self._state_names) def restore_from_state(self, state): + """Restore the trigger state from the previous stored state. + Note that this function will call `next(action)` for the exact times + that the :py:attr:`state` records how many times `next(action)` had + been called. The user should be aware of any possible side effect of + this behavior. + + Args: + state: The state previously obtained by :py:attr:`state`. + """ for name, value in state.items(): setattr(self, name, value) From 0794ddcbbd9b8e631c1624b596a5b833d3b26b4f Mon Sep 17 00:00:00 2001 From: wwt Date: Sun, 14 Oct 2018 21:17:45 -0400 Subject: [PATCH 32/65] add more trigger docs --- texar/utils/triggers.py | 82 +++++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index e4f8f967..ce3c7183 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -37,32 +37,29 @@ class Trigger(object): - """A trigger can do some action when certain condition is met. - Specifically, the user calls the trigger periodically. Every time the - trigger is called, it will send all arguments to :meth:`_predicate`, which - returns a boolean value indicates whether the condition is met. 
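To make the `(n_unmask, n_mask)` arguments of the helper documented above more concrete, here is a rough, standalone illustration of a teacher-mask pattern. The modular rule and the names below are assumptions for intuition only; the helper's actual masking logic is not shown in this hunk:

```
# Schematic only (not the helper's code): with n_unmask=2, n_mask=2, each
# cycle of 4 decoding steps feeds the softmax output for 2 steps and the
# ground-truth one-hot embedding for the other 2; n_shift rotates the cycle.
def teacher_mask(max_time, n_unmask, n_mask, n_shift=0):
    n_cycle = n_unmask + n_mask
    # True  -> "unmasked" step: feed the softmax distribution onward
    # False -> "masked" step:   feed the one-hot ground-truth token instead
    return [(t + n_shift) % n_cycle < n_unmask for t in range(max_time)]

print(teacher_mask(8, n_unmask=2, n_mask=2))  # [True, True, False, False, ...]
print(teacher_mask(8, n_unmask=1, n_mask=0))  # all True: mask fully annealed
```

Annealing `mask_patterns` from `(2, 2)` towards `(1, 0)`, as in the training configs, gradually removes the ground-truth inputs.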
Once the
-    condition is met, the trigger will then call `next(action)` to do next
-    action and obtain the returned value.
+    """This is the base class of all triggers. A trigger performs some action
+    when a certain condition is met. Specifically, the user calls the trigger
+    periodically. Every time the trigger is called, it sends all arguments to
+    :meth:`_predicate`, which returns a boolean value indicating whether the
+    condition is met. Once the condition is met, the trigger calls
+    `next(action)` to perform the next action and obtain the returned value.
 
     Args:
-        action (iterable): An iterable which does the action and possibly
-            returns a value.
-        default: The value returned after :attr:`action` stops iteration. If
-            not provided, the trigger will do nothing when StopIteration
+        action (iterable): An iterable which iteratively does the action and
+            possibly returns a value.
+        default (optional): The value returned once :attr:`action` is exhausted.
+            If not provided, the trigger will do nothing when `StopIteration`
             occurs.
     """
 
     def __init__(self, action, default=DEFAULT_ACTION):
-        """action is an iterator that iteratively do a sequence of action and
-        return result values. default is used as result value when action is
-        exhausted.
-        """
         self._action = iter(action)
         self._default = default
         self._triggered_times = 0
 
     def _predicate(self, *args, **kwargs):
-        """This function returns True when we think we should do something.
+        """This function returns True when the condition is met and we should
+        do something.
         """
         raise NotImplementedError
 
@@ -84,6 +81,9 @@ def _make_state(self, names):
 
     @property
     def _state_names(self):
+        """Returns a list of names of the trigger attributes that can be saved
+        and restored as the trigger state.
+        """
        return ['_triggered_times']
 
     @property
@@ -110,9 +110,30 @@ def restore_from_state(self, state):
             self._next_action()
 
     def save_to_pickle(self, file):
+        """Writes a pickled representation of the trigger state to the open
+        file-like object :attr:`file`.
+
+        Args:
+            file: The open file-like object to which we write. As described in
+                the official pickle documentation, it must have a `write()`
+                method that accepts a single string argument.
+        """
         pickle.dump(self.state, file)
 
     def restore_from_pickle(self, file):
+        """Reads a string from the open file-like object :attr:`file` and
+        restores the trigger state from it.
+        Note that this function will call `next(action)` exactly as many times
+        as recorded in the restored :py:attr:`state`. The user should be aware
+        of any possible side effects of this behavior.
+
+        Args:
+            file: The open file-like object from which we read. As described in
+                the official pickle documentation, it must have a `read()`
+                method that takes an integer argument, and a `readline()`
+                method that requires no arguments; both methods should return
+                a string.
+        """
         self.restore_from_state(pickle.load(file))
 
 
@@ -137,6 +158,25 @@ def _predicate(self, step):
 
 
 class BestEverConvergenceTrigger(Trigger):
+    """A trigger that maintains the best-ever value of a metric. It triggers
+    when the best value of the metric has not been updated for at least
+    :attr:`threshold_steps` steps. In order to avoid triggering too
+    frequently, it will not trigger again within
+    :attr:`minimum_interval_steps` steps of its last firing.
+
+    Args:
+        action (iterable): An iterable which iteratively does the action and
+            possibly returns a value.
+        threshold_steps (int): Number of steps to wait after the best value
+            was last updated before triggering.
+        minimum_interval_steps (int): Minimum number of steps between two
+            consecutive firings of the trigger.
+        default (optional): The value returned once :attr:`action` is exhausted.
+            If not provided, the trigger will do nothing when `StopIteration`
+            occurs.
+
+    .. document private functions
+    .. automethod:: __call__
+    """
 
     def __init__(self, action, threshold_steps, minimum_interval_steps,
                  default=DEFAULT_ACTION):
@@ -160,6 +200,18 @@ def _predicate(self, step, score):
             return True
         return False
 
+    def __call__(self, step, score):
+        """The trigger must be called to update the current training step
+        (:attr:`step`) and the current value of the maintained metric
+        (:attr:`score`).
+
+        Args:
+            step (int): Current training step to update. Training steps must
+                be reported in ascending order.
+            score (float): Current value of the maintained metric.
+        """
+        return super(BestEverConvergenceTrigger, self).__call__(step, score)
+
     @property
     def _state_names(self):
         return super(BestEverConvergenceTrigger, self)._state_names + [

From 0d3e18755dc5b3be1537a126a5926d800432d17e Mon Sep 17 00:00:00 2001
From: wwt 
Date: Sun, 14 Oct 2018 21:33:40 -0400
Subject: [PATCH 33/65] update README.md

---
 examples/differentiable_expected_bleu/README.md | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md
index 9b481c5e..eb7bcf9e 100644
--- a/examples/differentiable_expected_bleu/README.md
+++ b/examples/differentiable_expected_bleu/README.md
@@ -1,20 +1,14 @@
 # Seq2seq Model #
 
-This example builds an attentional seq2seq model for machine translation.
-
-## Usage ##
+This example builds an attentional seq2seq model for machine translation trained with Differentiable Expected BLEU (DEBLEU) and Teacher Mask. See https://openreview.net/pdf?id=S1x2aiRqFX for the implemented paper.
 
 ### Dataset ###
 
-Two example datasets are provided:
-
-  * toy_copy: A small toy autoencoding dataset from [TF Seq2seq toolkit](https://github.com/google/seq2seq/tree/2500c26add91b079ca00cf1f091db5a99ddab9ae).
   * iwslt14: The benchmark [IWSLT2014](https://sites.google.com/site/iwsltevaluation2014/home) (de-en) machine translation dataset.
 
 Download the data with the following cmds:
 
 ```
-python prepare_data.py --data toy_copy
 python prepare_data.py --data iwslt14
 ```
 
@@ -23,18 +17,17 @@ python prepare_data.py --data iwslt14
 Train the model with the following cmd:
 
 ```
-python seq2seq_attn.py --config_model config_model --config_data config_toy_copy
+python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14 --config_train config_train --pretrain_epochs 8
 ```
 
 Here:
   * `--config_model` specifies the model config. Note not to include the `.py` suffix.
   * `--config_data` specifies the data config.
+  * `--config_train` specifies the training config.
+  * `--pretrain_epochs` specifies the number of epochs to pretrain with cross-entropy loss.
 
 [config_model.py](./config_model.py) specifies a single-layer seq2seq model with Luong attention and bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file.
 
-For demonstration purpose, [config_model_full.py](./config_model_full.py) gives all possible hyperparameters for the model. The two config files will lead to the same model.
-
 ## Results ##
 
-On the IWSLT14 dataset, using original target texts as reference(no `` in the reference), the model achieves `BLEU=21.66` within `10` epochs.
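Putting the trigger API documented above together with the teacher-mask helper, the intended wiring in the example script looks roughly like the sketch below. It reuses `tm_helper`, `sess`, `step` and `val_bleu` from the surrounding script and assumes, rather than shows, the exact call site:

```
# Sketch of the intended usage (names taken from the example script).
mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)]

# Each next() on the action anneals the teacher mask to the next pattern.
action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask)
          for n_unmask, n_mask in mask_patterns[1:])

trigger = tx.utils.BestEverConvergenceTrigger(
    action,
    threshold_steps=10000,
    minimum_interval_steps=10000,
    default=None)

# After every evaluation, report the global step and the validation BLEU;
# once the best BLEU has stalled for threshold_steps, the trigger fires and
# advances to the next mask pattern.
trigger(step, val_bleu)
```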
- +On the IWSLT14 dataset, the model achieves `BLEU=25.35` after annealed all masks, while the cross-entropy trained model achieves `BLEU=24.57`. From b09578554f730f2f18cbce09e39cba26b5dae3a5 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 23:00:26 -0400 Subject: [PATCH 34/65] rename some filenames ; add val/test datasets --- .../differentiable_expected_bleu/README.md | 2 +- ...wslt14.py => config_data_iwslt14_de-en.py} | 17 ++++---- .../config_data_iwslt14_en-fr.py | 43 +++++++++++++++++++ ...train.py => config_train_iwslt14_de-en.py} | 2 +- .../differentiable_expected_bleu.py | 9 ++-- .../prepare_data.py | 17 +++----- 6 files changed, 65 insertions(+), 25 deletions(-) rename examples/differentiable_expected_bleu/{config_iwslt14.py => config_data_iwslt14_de-en.py} (63%) create mode 100644 examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py rename examples/differentiable_expected_bleu/{config_train.py => config_train_iwslt14_de-en.py} (96%) diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md index eb7bcf9e..0e1d2ad2 100644 --- a/examples/differentiable_expected_bleu/README.md +++ b/examples/differentiable_expected_bleu/README.md @@ -9,7 +9,7 @@ This example builds an attentional seq2seq model for machine translation trained Download the data with the following cmds: ``` -python prepare_data.py --data iwslt14 +python prepare_data.py --data de-en ``` ### Train the model ### diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py similarity index 63% rename from examples/differentiable_expected_bleu/config_iwslt14.py rename to examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index cfcc6d71..bbe3954b 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,16 +1,16 @@ -source_vocab_file = 'data/iwslt14/vocab.de' -target_vocab_file = 'data/iwslt14/vocab.en' +source_vocab_file = 'data/iwslt14_de-en/vocab.de' +target_vocab_file = 'data/iwslt14_de-en/vocab.en' train = { 'batch_size': 80, 'allow_smaller_final_batch': False, 'source_dataset': { - "files": 'data/iwslt14/train.de', + "files": 'data/iwslt14_de-en/train.de', 'vocab_file': source_vocab_file, 'max_seq_length': 50 }, 'target_dataset': { - 'files': 'data/iwslt14/train.en', + 'files': 'data/iwslt14_de-en/train.en', 'vocab_file': target_vocab_file, 'max_seq_length': 50 }, @@ -19,11 +19,11 @@ 'batch_size': 80, 'shuffle': False, 'source_dataset': { - "files": 'data/iwslt14/valid.de', + "files": 'data/iwslt14_de-en/valid.de', 'vocab_file': source_vocab_file, }, 'target_dataset': { - 'files': 'data/iwslt14/valid.en', + 'files': 'data/iwslt14_de-en/valid.en', 'vocab_file': target_vocab_file, }, } @@ -31,12 +31,11 @@ 'batch_size': 80, 'shuffle': False, 'source_dataset': { - "files": 'data/iwslt14/test.de', + "files": 'data/iwslt14_de-en/test.de', 'vocab_file': source_vocab_file, }, 'target_dataset': { - 'files': 'data/iwslt14/test.en', + 'files': 'data/iwslt14_de-en/test.en', 'vocab_file': target_vocab_file, }, } -val = test diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py new file mode 100644 index 00000000..2ebe3b40 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py @@ -0,0 +1,43 @@ +source_vocab_file = 'data/iwslt14_en-fr/vocab.en' 
+target_vocab_file = 'data/iwslt14_en-fr/vocab.fr' + +train = { + 'batch_size': 80, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14_en-fr/train.en', + 'vocab_file': source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14_en-fr/train.fr', + 'vocab_file': target_vocab_file, + 'max_seq_length': 50 + }, +} + +val = { + 'batch_size': 80, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_en-fr/valid.en', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_en-fr/valid.fr', + 'vocab_file': target_vocab_file, + }, +} + +test = { + 'batch_size': 80, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_en-fr/test.en', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_en-fr/test.fr', + 'vocab_file': target_vocab_file, + }, +} diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py similarity index 96% rename from examples/differentiable_expected_bleu/config_train.py rename to examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py index ccdde330..1e37fe9a 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py @@ -42,4 +42,4 @@ }, } -expr_name = 'train' +expr_name = 'iwslt14_de-en' diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ee12b3dc..52bab578 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -27,9 +27,9 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train", "The training config.") +flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") -flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") +flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") FLAGS = flags.FLAGS @@ -254,9 +254,10 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch, ' (pretraining)' if pretraining else '')) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + test_bleu = _eval_epoch(sess, summary_writer, 'test', trigger) step = tf.train.global_step(sess, global_step) - print('epoch: {}, step: {}, val bleu: {}'.format( - epoch, step, val_bleu)) + print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( + epoch, step, val_bleu, test_bleu)) if val_bleu > best_val_bleu: best_val_bleu = val_bleu diff --git a/examples/differentiable_expected_bleu/prepare_data.py b/examples/differentiable_expected_bleu/prepare_data.py index a5cc357b..a7557c09 100644 --- a/examples/differentiable_expected_bleu/prepare_data.py +++ b/examples/differentiable_expected_bleu/prepare_data.py @@ -16,31 +16,28 @@ import tensorflow as tf import texar as tx +import os + # pylint: disable=invalid-name flags = tf.flags -flags.DEFINE_string("data", "iwslt14", "Data to download [iwslt14|toy_copy]") +flags.DEFINE_string("data", "de-en", "Data to download [de-en|en-fr]") FLAGS = flags.FLAGS def prepare_data(): """Downloads data. 
""" - if FLAGS.data == 'iwslt14': + if FLAGS.data == 'de-en': tx.data.maybe_download( urls='https://drive.google.com/file/d/' '1Vuv3bed10qUxrpldHdYoiWLzPKa4pNXd/view?usp=sharing', path='./', - filenames='iwslt14.zip', - extract=True) - elif FLAGS.data == 'toy_copy': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '1fENE2rakm8vJ8d3voWBgW4hGlS6-KORW/view?usp=sharing', - path='./', - filenames='toy_copy.zip', + filenames='iwslt14_de-en.zip', extract=True) + os.rename(os.path.join('data', 'iwslt14'), + os.path.join('data', 'iwslt14_de-en')) else: raise ValueError('Unknown data: {}'.format(FLAGS.data)) From 06c572714c6afd7126d3ae574bd76ca012a7da28 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 23:20:04 -0400 Subject: [PATCH 35/65] add config_train_iwslt14_en-fr.py --- .../config_train_iwslt14_en-fr.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py new file mode 100644 index 00000000..e3751956 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py @@ -0,0 +1,45 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "piecewise_constant", + "kwargs": { + "boundaries": [10000], + "values": [1e-3, 1e-5], + }, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ }, + }, +} + +expr_name = 'iwslt14_en-fr' From 5305d3864dc1b5c6a0fbb1d766d2c2d7a82864e1 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 23:21:27 -0400 Subject: [PATCH 36/65] update README.md --- examples/differentiable_expected_bleu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md index 0e1d2ad2..ad5d685c 100644 --- a/examples/differentiable_expected_bleu/README.md +++ b/examples/differentiable_expected_bleu/README.md @@ -17,7 +17,7 @@ python prepare_data.py --data de-en Train the model with the following cmd: ``` -python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14 --config_train config_train --pretrain_epochs 8 +python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14_de-en --config_train config_train_iwslt14_de-en --pretrain_epochs 8 ``` Here: From 3aab0a624089004d478522a570953ed8746a608d Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 15 Oct 2018 20:08:36 -0400 Subject: [PATCH 37/65] replace moses bleu by nltk bleu --- .../differentiable_expected_bleu.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 52bab578..eeba8e28 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -25,6 +25,8 @@ import tensorflow as tf import texar as tx +from nltk.translate.bleu_score import corpus_bleu + flags = tf.flags flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", "The training config.") @@ -181,9 +183,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): while True: try: target_texts_ori, output_ids = sess.run(fetches, feed_dict) - target_texts = tx.utils.strip_special_tokens(target_texts_ori) + target_texts = tx.utils.strip_special_tokens( + target_texts_ori.tolist(), is_token_list=True) output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) + ids=output_ids.tolist(), vocab=val_data.target_vocab, + join=False) ref_hypo_pairs.extend( zip(map(lambda x: [x], target_texts), output_texts)) @@ -192,8 +196,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): break refs, hypos = zip(*ref_hypo_pairs) - bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, - hypotheses=hypos) + bleu = corpus_bleu(refs, hypos) * 100 print('{} BLEU: {}'.format(mode, bleu)) step = tf.train.global_step(sess, global_step) From 8ca85a9bcb9e85ea4ea36ddae69287b57d617a32 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Tue, 16 Oct 2018 02:11:10 -0400 Subject: [PATCH 38/65] modify model --- .../config_model.py | 12 ++++++++++- .../differentiable_expected_bleu.py | 20 +++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 3ba0c867..16dba9b9 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -7,13 +7,23 @@ embedder = { 'dim': embedding_dim } + encoder = { 'rnn_cell_fw': { 'kwargs': { 'num_units': num_units - } + }, + 'num_layers': 2 + }, + 'output_layer_fw': { + 'dropout_rate': 0 } } + +connector = { + 'activation_fn': 'tanh' +} + decoder = { 
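For reference, the NLTK `corpus_bleu` call introduced in the patch above expects plain token lists: each hypothesis is a list of tokens, and each entry of the references argument is a list of one or more reference token lists (here exactly one, matching the script). The toy sentences below are made up:

```
from nltk.translate.bleu_score import corpus_bleu

refs = [[['this', 'is', 'a', 'small', 'test']],          # one reference each
        [['another', 'short', 'sentence', 'to', 'score']]]
hypos = [['this', 'is', 'a', 'small', 'test'],
         ['another', 'short', 'sentence', 'for', 'scoring']]

# Corpus-level BLEU in [0, 1]; the script scales it by 100.
print(corpus_bleu(refs, hypos) * 100)
```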
'rnn_cell': { 'kwargs': { diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index eeba8e28..904410bd 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -54,7 +54,8 @@ def build_model(batch, train_data): encoder = tx.modules.BidirectionalRNNEncoder( hparams=config_model.encoder) - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) + enc_outputs, enc_final_state = encoder( + source_embedder(batch['source_text_ids'])) target_embedder = tx.modules.WordEmbedder( vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) @@ -65,9 +66,23 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) + enc_final_state = tf.contrib.framework.nest.map_structure( + lambda *args: tf.concat(args, -1), *enc_final_state) + + if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): + connector = tx.modules.MLPTransformConnector( + decoder.state_size.h, hparams=config_model.connector) + dec_initial_h = connector(enc_final_state.h) + dec_initial_state = (dec_initial_h, enc_final_state.c) + else: + connector = tx.modules.MLPTransformConnector( + decoder.state_size, hparams=config_model.connector) + dec_initial_state = connector(enc_final_state) + # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( decoding_strategy='train_greedy', + initial_state=dec_initial_state, inputs=target_embedder(batch['target_text_ids'][:, :-1]), sequence_length=batch['target_length']-1) @@ -91,7 +106,8 @@ def build_model(batch, train_data): tau=config_train.tau) tm_outputs, _, _ = decoder( - helper=tm_helper) + helper=tm_helper, + initial_state=dec_initial_state) loss_debleu = tx.losses.debleu( labels=batch['target_text_ids'][:, 1:], From 78b6994ce222f4598b7ad7605ba9125290e41dca Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Tue, 16 Oct 2018 22:56:51 -0400 Subject: [PATCH 39/65] refine models --- .../config_model.py | 4 --- .../config_train_iwslt14_de-en.py | 2 -- .../config_train_iwslt14_en-fr.py | 2 -- .../differentiable_expected_bleu.py | 31 ++++++------------- 4 files changed, 9 insertions(+), 30 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 16dba9b9..3b7a8da7 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -20,10 +20,6 @@ } } -connector = { - 'activation_fn': 'tanh' -} - decoder = { 'rnn_cell': { 'kwargs': { diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py index 1e37fe9a..2b057887 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py @@ -41,5 +41,3 @@ }, }, } - -expr_name = 'iwslt14_de-en' diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py index e3751956..2b057887 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py @@ -41,5 +41,3 @@ }, }, } - -expr_name = 'iwslt14_en-fr' diff --git 
a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 904410bd..efe1e255 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,9 +29,13 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", "The training config.") +flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", + "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") -flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") +flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", + "The dataset config.") +flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. " + "Also used as the directory name of run.") flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") FLAGS = flags.FLAGS @@ -39,9 +43,8 @@ config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) +expr_name = FLAGS.expr_name pretrain_epochs = FLAGS.pretrain_epochs - -expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns @@ -54,8 +57,7 @@ def build_model(batch, train_data): encoder = tx.modules.BidirectionalRNNEncoder( hparams=config_model.encoder) - enc_outputs, enc_final_state = encoder( - source_embedder(batch['source_text_ids'])) + enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) target_embedder = tx.modules.WordEmbedder( vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) @@ -66,23 +68,9 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - enc_final_state = tf.contrib.framework.nest.map_structure( - lambda *args: tf.concat(args, -1), *enc_final_state) - - if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): - connector = tx.modules.MLPTransformConnector( - decoder.state_size.h, hparams=config_model.connector) - dec_initial_h = connector(enc_final_state.h) - dec_initial_state = (dec_initial_h, enc_final_state.c) - else: - connector = tx.modules.MLPTransformConnector( - decoder.state_size, hparams=config_model.connector) - dec_initial_state = connector(enc_final_state) - # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( decoding_strategy='train_greedy', - initial_state=dec_initial_state, inputs=target_embedder(batch['target_text_ids'][:, :-1]), sequence_length=batch['target_length']-1) @@ -106,8 +94,7 @@ def build_model(batch, train_data): tau=config_train.tau) tm_outputs, _, _ = decoder( - helper=tm_helper, - initial_state=dec_initial_state) + helper=tm_helper) loss_debleu = tx.losses.debleu( labels=batch['target_text_ids'][:, 1:], From 82bc6a8561f792e4fb6ad8f2ed5a3108b2c9b5d3 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Wed, 17 Oct 2018 23:31:10 -0400 Subject: [PATCH 40/65] refine summary ; batch_size=160 --- .../config_data_iwslt14_de-en.py | 8 ++-- .../config_data_iwslt14_en-fr.py | 8 ++-- .../config_train_expd1e-2_xe.py | 46 +++++++++++++++++++ .../config_train_expd2e-2_xe.py | 46 +++++++++++++++++++ .../config_train_expd5e-3_xe.py | 46 +++++++++++++++++++ .../differentiable_expected_bleu.py | 29 ++++++++---- 6 files changed, 168 insertions(+), 15 deletions(-) create mode 100644 
examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py create mode 100644 examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py create mode 100644 examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index bbe3954b..fb03a8bb 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,8 +1,10 @@ source_vocab_file = 'data/iwslt14_de-en/vocab.de' target_vocab_file = 'data/iwslt14_de-en/vocab.en' +batch_size = 160 + train = { - 'batch_size': 80, + 'batch_size': batch_size, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14_de-en/train.de', @@ -16,7 +18,7 @@ }, } val = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/valid.de', @@ -28,7 +30,7 @@ }, } test = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/test.de', diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py index 2ebe3b40..a81090e6 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py @@ -1,8 +1,10 @@ source_vocab_file = 'data/iwslt14_en-fr/vocab.en' target_vocab_file = 'data/iwslt14_en-fr/vocab.fr' +batch_size = 160 + train = { - 'batch_size': 80, + 'batch_size': batch_size, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14_en-fr/train.en', @@ -17,7 +19,7 @@ } val = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_en-fr/valid.en', @@ -30,7 +32,7 @@ } test = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_en-fr/test.en', diff --git a/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py new file mode 100644 index 00000000..69331564 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py @@ -0,0 +1,46 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "exponential_decay", + "kwargs": { + "decay_steps": 10000, + "decay_rate": 1e-2, + }, + "min_learning_rate": 1e-5, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py new file mode 100644 index 00000000..d5e9759a --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py @@ -0,0 +1,46 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. 
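The `config_train_expd*` files added here (continued below) schedule the cross-entropy learning rate with `exponential_decay`. As a rough sketch of the resulting schedule — assuming the optimizer starts from Adam's default learning rate of 1e-3, which these configs do not set explicitly, and that `min_learning_rate` acts as a floor:

```
# Rough shape of the schedule; not Texar's implementation.
def decayed_lr(step, initial_lr=1e-3, decay_rate=2e-2, decay_steps=10000,
               min_lr=1e-5):
    return max(initial_lr * decay_rate ** (step / decay_steps), min_lr)

print(decayed_lr(0))       # 1e-3
print(decayed_lr(10000))   # 2e-05 after one full decay period
print(decayed_lr(20000))   # clipped at the 1e-05 floor
```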
+infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "exponential_decay", + "kwargs": { + "decay_steps": 10000, + "decay_rate": 2e-2, + }, + "min_learning_rate": 1e-5, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py b/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py new file mode 100644 index 00000000..613369fd --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py @@ -0,0 +1,46 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "exponential_decay", + "kwargs": { + "decay_steps": 10000, + "decay_rate": 5e-3, + }, + "min_learning_rate": 1e-5, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index efe1e255..5e052096 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -137,14 +137,23 @@ def main(): train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) - tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) - tf.summary.scalar('tm/n_mask', tm_helper.n_mask) - - merged_summary = tf.summary.merge_all() + summary_tm = [ + tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), + tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] + summary_xe_op = tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope='/'.join(train_xe_op.name.split('/')[:-1])), + name='summary_xe') + summary_debleu_op = tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope='/'.join(train_xe_op.name.split('/')[:-1])) + summary_tm, + name='summary_debleu') saver = tf.train.Saver(max_to_keep=None) - def _train_epoch(sess, summary_writer, train_op, trigger): + def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): print('in _train_epoch') data_iterator.restart_dataset(sess, 'train') @@ -156,7 +165,7 @@ def _train_epoch(sess, summary_writer, train_op, trigger): while True: try: loss, summary, step = sess.run( - (train_op, merged_summary, global_step), feed_dict) + (train_op, summary_op, global_step), feed_dict) summary_writer.add_summary(summary, step) @@ -279,9 +288,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('saved to {}'.format(saved_path)) - train_op = train_xe_op if 
pretraining else train_debleu_op - _train_epoch(sess, summary_writer, train_op, - None if pretraining else trigger) + train_op, summary_op, trigger_ = { + True: (train_xe_op, summary_xe_op, None), + False: (train_debleu_op, summary_debleu_op, trigger) + }[pretraining] + _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) epoch += 1 step = tf.train.global_step(sess, global_step) From fffd6486b618d1587ed11584c6dd48c58f677fa0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 18 Oct 2018 19:37:16 +0000 Subject: [PATCH 41/65] remove exponetial decay configs ; fix summary bug --- .../config_train_expd1e-2_xe.py | 46 ------------------- .../config_train_expd2e-2_xe.py | 46 ------------------- .../config_train_expd5e-3_xe.py | 46 ------------------- .../config_train_iwslt14_de-en.py | 4 +- .../config_train_iwslt14_en-fr.py | 4 +- .../differentiable_expected_bleu.py | 8 +++- 6 files changed, 12 insertions(+), 142 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py delete mode 100644 examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py delete mode 100644 examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py diff --git a/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py deleted file mode 100644 index 69331564..00000000 --- a/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py +++ /dev/null @@ -1,46 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 10000, - "decay_rate": 1e-2, - }, - "min_learning_rate": 1e-5, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py deleted file mode 100644 index d5e9759a..00000000 --- a/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py +++ /dev/null @@ -1,46 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 10000, - "decay_rate": 2e-2, - }, - "min_learning_rate": 1e-5, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. 
- }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py b/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py deleted file mode 100644 index 613369fd..00000000 --- a/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py +++ /dev/null @@ -1,46 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 10000, - "decay_rate": 5e-3, - }, - "min_learning_rate": 1e-5, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py index 2b057887..3ce1d904 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py @@ -15,7 +15,7 @@ "learning_rate_decay": { "type": "piecewise_constant", "kwargs": { - "boundaries": [10000], + "boundaries": [100000], "values": [1e-3, 1e-5], }, }, @@ -25,6 +25,7 @@ "clip_norm": 5. }, }, + "name": "XE" } train_debleu = { @@ -40,4 +41,5 @@ "clip_norm": 5. }, }, + "name": "DEBLEU" } diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py index 2b057887..3ce1d904 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py @@ -15,7 +15,7 @@ "learning_rate_decay": { "type": "piecewise_constant", "kwargs": { - "boundaries": [10000], + "boundaries": [100000], "values": [1e-3, 1e-5], }, }, @@ -25,6 +25,7 @@ "clip_norm": 5. }, }, + "name": "XE" } train_debleu = { @@ -40,4 +41,5 @@ "clip_norm": 5. }, }, + "name": "DEBLEU" } diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 5e052096..dc924bc8 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -48,6 +48,10 @@ mask_patterns = config_train.mask_patterns +def get_scope_by_name(tensor): + return tensor.name[: tensor.name.rfind('/') + 1] + + def build_model(batch, train_data): """Assembles the seq2seq model. 
""" @@ -143,12 +147,12 @@ def main(): summary_xe_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope='/'.join(train_xe_op.name.split('/')[:-1])), + scope=get_scope_by_name(train_xe_op)), name='summary_xe') summary_debleu_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope='/'.join(train_xe_op.name.split('/')[:-1])) + summary_tm, + scope=get_scope_by_name(train_debleu_op)) + summary_tm, name='summary_debleu') saver = tf.train.Saver(max_to_keep=None) From 6d07aa16c821f9f3bfdb9f152b3a6aa600c5d15d Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Fri, 19 Oct 2018 02:36:43 +0000 Subject: [PATCH 42/65] add stages --- .../config_en-fr_xe_1e3_xe_1e5_debleu.py | 41 ----------------- .../config_train_iwslt14_de-en.py | 45 ------------------- .../config_train_iwslt14_en-fr.py | 45 ------------------- .../differentiable_expected_bleu.py | 39 ++++++++++------ 4 files changed, 25 insertions(+), 145 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py delete mode 100644 examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py delete mode 100644 examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py diff --git a/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py b/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py deleted file mode 100644 index 07acbea8..00000000 --- a/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py +++ /dev/null @@ -1,41 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -wait_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, -} - -expr_name = 'en-fr_xe_1e3_xe_1e5_debleu' diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py deleted file mode 100644 index 3ce1d904..00000000 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py +++ /dev/null @@ -1,45 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "piecewise_constant", - "kwargs": { - "boundaries": [100000], - "values": [1e-3, 1e-5], - }, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. 
- }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py deleted file mode 100644 index 3ce1d904..00000000 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py +++ /dev/null @@ -1,45 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "piecewise_constant", - "kwargs": { - "boundaries": [100000], - "values": [1e-3, 1e-5], - }, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index dc924bc8..15e1991b 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -36,7 +36,8 @@ "The dataset config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. " "Also used as the directory name of run.") -flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") +flags.DEFINE_integer("pretrain_epochs", 10000, "Number of pretraining epochs.") +flags.DEFINE_string("stage", "xe0", "stage.") FLAGS = flags.FLAGS @@ -45,6 +46,7 @@ config_data = importlib.import_module(FLAGS.config_data) expr_name = FLAGS.expr_name pretrain_epochs = FLAGS.pretrain_epochs +stage = FLAGS.stage mask_patterns = config_train.mask_patterns @@ -83,9 +85,13 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - train_xe_op = tx.core.get_train_op( + train_xe0_op = tx.core.get_train_op( loss_xe, - hparams=config_train.train_xe) + hparams=config_train.train_xe0) + + train_xe1_op = tx.core.get_train_op( + loss_xe, + hparams=config_train.train_xe1) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -122,7 +128,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe_op, train_debleu_op, tm_helper, bs_outputs + return train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, bs_outputs def main(): @@ -138,16 +144,21 @@ def main(): global_step = tf.train.create_global_step() - train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ + train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] - summary_xe_op = tf.summary.merge( + summary_xe0_op = tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope=get_scope_by_name(train_xe0_op)), + name='summary_xe') + summary_xe1_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe_op)), + scope=get_scope_by_name(train_xe1_op)), name='summary_xe') summary_debleu_op = 
tf.summary.merge( tf.get_collection( @@ -268,9 +279,8 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch = 0 while epoch < config_train.max_epochs: - pretraining = epoch < pretrain_epochs print('epoch #{}{}:'.format( - epoch, ' (pretraining)' if pretraining else '')) + epoch, ' ({})'.format(stage))) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) test_bleu = _eval_epoch(sess, summary_writer, 'test', trigger) @@ -285,7 +295,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): saved_path = saver.save( sess, ckpt_best, global_step=step) - if not pretraining: + if stage == 'debleu': with open('{}.trigger'.format(saved_path), 'w') as \ pickle_file: trigger.save_to_pickle(pickle_file) @@ -293,16 +303,17 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('saved to {}'.format(saved_path)) train_op, summary_op, trigger_ = { - True: (train_xe_op, summary_xe_op, None), - False: (train_debleu_op, summary_debleu_op, trigger) - }[pretraining] + 'xe0': (train_xe0_op, summary_xe0_op, None), + 'xe1': (train_xe1_op, summary_xe1_op, None), + 'debleu': (train_debleu_op, summary_debleu_op, trigger) + }[stage] _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) epoch += 1 step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) - if not pretraining: + if stage == 'debleu': with open('{}.trigger'.format(saved_path), 'w') as pickle_file: trigger.save_to_pickle(pickle_file) From 923ea8cab258ff43555e468358bc2734b186649b Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Fri, 19 Oct 2018 02:38:32 +0000 Subject: [PATCH 43/65] add config_train --- .../config_train.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train.py diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py new file mode 100644 index 00000000..1d55dd40 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train.py @@ -0,0 +1,57 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe0 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-3 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE_0" +} + +train_xe1 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE_1" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
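The `"name"` fields in these training configs give each train op its own variable scope, which is what the summary fix in PATCH 41 relies on: summaries created while building an op can later be fetched back by scope. A minimal sketch of that pattern, mirroring the `get_scope_by_name` helper added above (plain TF 1.x; `train_op` is any op built under such a named scope):

```
import tensorflow as tf

def scope_of(tensor):
    # Keep everything up to the last '/', e.g. "XE_0/Adam" -> "XE_0/".
    return tensor.name[: tensor.name.rfind('/') + 1]

def merged_summaries_for(train_op, name):
    # Merge only the summaries recorded under this train op's scope,
    # so each training stage gets its own summary op.
    return tf.summary.merge(
        tf.get_collection(tf.GraphKeys.SUMMARIES, scope=scope_of(train_op)),
        name=name)
```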
+ }, + }, + "name": "DEBLEU" +} From 56b44c7cc7e7ccce000692e14922c4127daadfaa Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Fri, 19 Oct 2018 22:41:44 -0400 Subject: [PATCH 44/65] modify 2-layer encoder to 1-layer --- examples/differentiable_expected_bleu/config_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 3b7a8da7..55afef49 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -13,7 +13,6 @@ 'kwargs': { 'num_units': num_units }, - 'num_layers': 2 }, 'output_layer_fw': { 'dropout_rate': 0 From c6991c8027778b214c65df8dadcb3ac0c1ea3b9b Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 20 Oct 2018 01:25:57 -0400 Subject: [PATCH 45/65] change configs to bowen's --- .../config_data_iwslt14_de-en.py | 2 ++ examples/differentiable_expected_bleu/config_model.py | 7 ++----- examples/differentiable_expected_bleu/config_train.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index fb03a8bb..cf0c645f 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -17,6 +17,7 @@ 'max_seq_length': 50 }, } + val = { 'batch_size': batch_size, 'shuffle': False, @@ -29,6 +30,7 @@ 'vocab_file': target_vocab_file, }, } + test = { 'batch_size': batch_size, 'shuffle': False, diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 55afef49..125b1fc6 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -1,8 +1,8 @@ # Attentional Seq2seq model. # Hyperparameters not specified here will take the default values. -num_units = 1000 -embedding_dim = 500 +num_units = 256 +embedding_dim = 256 embedder = { 'dim': embedding_dim @@ -14,9 +14,6 @@ 'num_units': num_units }, }, - 'output_layer_fw': { - 'dropout_rate': 0 - } } decoder = { diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 1d55dd40..2ecf2210 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,7 +1,7 @@ max_epochs = 1000 steps_per_eval = 500 tau = 1. 
-infer_beam_width = 1 +infer_beam_width = 10 infer_max_decoding_length = 50 mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] From 7e89acf59c04569e0a0f435f5e940f3a8168cc7f Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 20 Oct 2018 17:48:18 -0400 Subject: [PATCH 46/65] open trigger file in binary mode --- .../differentiable_expected_bleu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 15e1991b..7a4744d4 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -267,7 +267,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): trigger_path = '{}.trigger'.format(ckpt_path) if os.path.exists(trigger_path): - with open(trigger_path, 'r') as pickle_file: + with open(trigger_path, 'rb') as pickle_file: trigger.restore_from_pickle(pickle_file) else: print('cannot find previous trigger state.') @@ -296,7 +296,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess, ckpt_best, global_step=step) if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'w') as \ + with open('{}.trigger'.format(saved_path), 'wb') as \ pickle_file: trigger.save_to_pickle(pickle_file) From de78471aa4badf0f0406251ebd40b80b126c1cf5 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 20 Oct 2018 18:18:41 -0400 Subject: [PATCH 47/65] add binary mode --- .../differentiable_expected_bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7a4744d4..7529afe8 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -314,7 +314,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): saved_path = saver.save(sess, ckpt_model, global_step=step) if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'w') as pickle_file: + with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) From b822f393e9dbeb215f8c3225c8fe8ca4ced5adde Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sun, 21 Oct 2018 22:37:13 -0400 Subject: [PATCH 48/65] use new datasets ; reinitialize optimizer when annealing ; modify config_train.py ; replace tf.Variable by tf.get_variable in TeacherMaskTrainingHelper --- .../config_train.py | 26 ++------- .../differentiable_expected_bleu.py | 58 ++++++++++++------- .../prepare_data.py | 2 +- texar/modules/decoders/rnn_decoder_helpers.py | 10 ++-- 4 files changed, 49 insertions(+), 47 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 2ecf2210..bb80f0ca 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,18 +1,18 @@ max_epochs = 1000 -steps_per_eval = 500 +steps_per_eval = int(1e9) tau = 1. 
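PATCH 48's "reinitialize optimizer when annealing" refers to resetting the optimizer's slot variables (Adam's moment accumulators) whenever the training phase or mask pattern changes. Because each train op lives under its own named scope, its variables can be gathered and re-initialized in isolation without touching the model weights. A minimal sketch of that idea (plain TF 1.x; helper name is illustrative, and it assumes the optimizer is created inside the train op's scope as in the configs above):

```
import tensorflow as tf

def reinitialize_optimizer_state(sess, train_op):
    # Adam's slot variables (m, v, beta-power accumulators) are created
    # under the same scope as the train op, so running their initializer
    # resets the optimizer state only.
    scope = train_op.name[: train_op.name.rfind('/') + 1]
    optimizer_vars = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
    sess.run(tf.variables_initializer(optimizer_vars))
```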
infer_beam_width = 10 infer_max_decoding_length = 50 mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 +threshold_steps = 25000 minimum_interval_steps = 10000 -train_xe0 = { +train_xe = { "optimizer": { "type": "AdamOptimizer", "kwargs": { - "learning_rate": 1e-3 + "learning_rate": [1e-3, 1e-5] } }, "gradient_clip": { @@ -21,23 +21,7 @@ "clip_norm": 5. }, }, - "name": "XE_0" -} - -train_xe1 = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5 - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE_1" + "name": "XE" } train_debleu = { diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7529afe8..90e26172 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,7 +29,7 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", +flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", @@ -38,6 +38,8 @@ "Also used as the directory name of run.") flags.DEFINE_integer("pretrain_epochs", 10000, "Number of pretraining epochs.") flags.DEFINE_string("stage", "xe0", "stage.") +flags.DEFINE_boolean("reinitialize_optimizer", False, "Whether to reinitialize " + "optimizer state before training.") FLAGS = flags.FLAGS @@ -47,8 +49,13 @@ expr_name = FLAGS.expr_name pretrain_epochs = FLAGS.pretrain_epochs stage = FLAGS.stage +reinitialize_optimizer = FLAGS.reinitialize_optimizer mask_patterns = config_train.mask_patterns +if stage.startswith("xe"): + d = config_train.train_xe["optimizer"]["kwargs"] + d["learning_rate"] = d["learning_rate"][int(stage[2:])] + def get_scope_by_name(tensor): return tensor.name[: tensor.name.rfind('/') + 1] @@ -85,13 +92,9 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - train_xe0_op = tx.core.get_train_op( - loss_xe, - hparams=config_train.train_xe0) - - train_xe1_op = tx.core.get_train_op( + train_xe_op = tx.core.get_train_op( loss_xe, - hparams=config_train.train_xe1) + hparams=config_train.train_xe) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -128,7 +131,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, bs_outputs + return train_xe_op, train_debleu_op, tm_helper, bs_outputs def main(): @@ -144,21 +147,26 @@ def main(): global_step = tf.train.create_global_step() - train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, infer_outputs = \ + train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) + train_xe_op_initializer, train_debleu_op_initializer = [ + tf.variables_initializer( + tf.get_collection( + tf.GraphKeys.GLOBAL_VARIABLES, + scope=get_scope_by_name(train_op)), + name=name) + for train_op, name in [ + (train_xe_op, "train_xe_op_initializer"), + (train_debleu_op, "train_debleu_op_initializer")]] + summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] - summary_xe0_op = tf.summary.merge( + 
summary_xe_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe0_op)), - name='summary_xe') - summary_xe1_op = tf.summary.merge( - tf.get_collection( - tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe1_op)), + scope=get_scope_by_name(train_xe_op)), name='summary_xe') summary_debleu_op = tf.summary.merge( tf.get_collection( @@ -243,8 +251,12 @@ def _eval_epoch(sess, summary_writer, mode, trigger): best_val_bleu = -1 with tf.Session() as sess: - action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) - for n_unmask, n_mask in mask_patterns[1:]) + def action_of_mask(mask_pattern): + sess.run(train_debleu_op_initializer) + tm_helper.assign_mask_pattern(sess, *mask_pattern) + + action = (action_of_mask(mask_pattern) + for mask_pattern in mask_patterns[1:]) trigger = tx.utils.BestEverConvergenceTrigger( action, config_train.threshold_steps, @@ -265,6 +277,10 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('restoring from {} ...'.format(ckpt_path)) saver.restore(sess, ckpt_path) + if reinitialize_optimizer: + sess.run(train_xe_op_initializer) + sess.run(train_debleu_op_initializer) + trigger_path = '{}.trigger'.format(ckpt_path) if os.path.exists(trigger_path): with open(trigger_path, 'rb') as pickle_file: @@ -283,7 +299,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch, ' ({})'.format(stage))) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) - test_bleu = _eval_epoch(sess, summary_writer, 'test', trigger) + test_bleu = _eval_epoch(sess, summary_writer, 'test', None) step = tf.train.global_step(sess, global_step) print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( epoch, step, val_bleu, test_bleu)) @@ -303,8 +319,8 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('saved to {}'.format(saved_path)) train_op, summary_op, trigger_ = { - 'xe0': (train_xe0_op, summary_xe0_op, None), - 'xe1': (train_xe1_op, summary_xe1_op, None), + 'xe0': (train_xe_op, summary_xe_op, None), + 'xe1': (train_xe_op, summary_xe_op, None), 'debleu': (train_debleu_op, summary_debleu_op, trigger) }[stage] _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) diff --git a/examples/differentiable_expected_bleu/prepare_data.py b/examples/differentiable_expected_bleu/prepare_data.py index a7557c09..8a19075b 100644 --- a/examples/differentiable_expected_bleu/prepare_data.py +++ b/examples/differentiable_expected_bleu/prepare_data.py @@ -32,7 +32,7 @@ def prepare_data(): if FLAGS.data == 'de-en': tx.data.maybe_download( urls='https://drive.google.com/file/d/' - '1Vuv3bed10qUxrpldHdYoiWLzPKa4pNXd/view?usp=sharing', + '1y4mUWXRS2KstgHopCS9koZ42ENOh6Yb9/view?usp=sharing', path='./', filenames='iwslt14_de-en.zip', extract=True) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index f8e7040e..dba5087c 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -410,10 +410,12 @@ def __init__(self, inputs, sequence_length, embedding, n_unmask, self._zero_next_inputs = tf.zeros_like( self._embedding_fn(self._zero_inputs)) - self._n_unmask = tf.Variable(n_unmask, name='n_unmask') - self._n_mask = tf.Variable(n_mask, name='n_mask') + self._n_unmask = tf.get_variable( + "n_unmask", initializer=n_unmask, trainable=False) + self._n_mask = tf.get_variable( + "n_mask", initializer=n_mask, trainable=False) self._n_cycle = tf.add( - self._n_unmask, self._n_mask, name='n_cycle') + 
self._n_unmask, self._n_mask, name="n_cycle") self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) self._assign_n_unmask = tf.assign( @@ -421,7 +423,7 @@ def __init__(self, inputs, sequence_length, embedding, n_unmask, self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) self._n_shift = tf.random_uniform( [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, - seed=self._seed, name='n_shift') + seed=self._seed, name="n_shift") @property def sample_ids_dtype(self): From 9d6e4bb1f5d948aff5577928c3f31bb91ad465c3 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sun, 21 Oct 2018 22:50:50 -0400 Subject: [PATCH 49/65] replace name_scope by variable_scope in TeacherMaskSoftmaxEmbeddingHelper --- texar/modules/decoders/rnn_decoder_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index dba5087c..3ff9d419 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -394,8 +394,8 @@ class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): def __init__(self, inputs, sequence_length, embedding, n_unmask, n_mask, tau=1., time_major=False, seed=None, stop_gradient=False, name=None): - with tf.name_scope(name, "TeacherMaskSoftmaxEmbeddingHelper", - [embedding, tau, seed, stop_gradient]): + with tf.variable_scope(name, "TeacherMaskSoftmaxEmbeddingHelper", + [embedding, tau, seed, stop_gradient]): super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( inputs=inputs, sequence_length=sequence_length, From ed1f6f3ec218c77ec13ffd17a75009e5dedf6aad Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 22 Oct 2018 14:03:23 -0400 Subject: [PATCH 50/65] fix lr bug --- .../differentiable_expected_bleu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 90e26172..c4de2695 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -52,9 +52,11 @@ reinitialize_optimizer = FLAGS.reinitialize_optimizer mask_patterns = config_train.mask_patterns +d = config_train.train_xe["optimizer"]["kwargs"] if stage.startswith("xe"): - d = config_train.train_xe["optimizer"]["kwargs"] d["learning_rate"] = d["learning_rate"][int(stage[2:])] +else: + d["learning_rate"] = d["learning_rate"][-1] def get_scope_by_name(tensor): From 3cfc217598c0aa9cc8a107a792689f21616cfa1e Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 28 Oct 2018 23:35:58 -0400 Subject: [PATCH 51/65] reset model and configs to those in pytorch codes ; fix connector bug ; move best checkpoint into evaluation --- .../config_data_iwslt14_de-en.py | 2 +- .../config_data_iwslt14_en-fr.py | 2 +- .../config_model.py | 12 +++- .../config_train.py | 8 +-- .../differentiable_expected_bleu.py | 63 ++++++++++++------- 5 files changed, 58 insertions(+), 29 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index cf0c645f..a3236629 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,7 +1,7 @@ source_vocab_file = 'data/iwslt14_de-en/vocab.de' 
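The `name_scope` to `variable_scope` change in PATCH 49 matters because `tf.get_variable`, now used for the mask counters, ignores `tf.name_scope`; only `tf.variable_scope` prefixes the variables it creates. A minimal illustration (hypothetical scope names):

```
import tensorflow as tf

with tf.name_scope("helper_ns"):
    a = tf.get_variable("n_unmask", initializer=1, trainable=False)

with tf.variable_scope("helper_vs"):
    b = tf.get_variable("n_mask", initializer=0, trainable=False)

print(a.name)  # "n_unmask:0"          -- name_scope was not applied
print(b.name)  # "helper_vs/n_mask:0"  -- variable_scope was applied
```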
target_vocab_file = 'data/iwslt14_de-en/vocab.en' -batch_size = 160 +batch_size = 80 train = { 'batch_size': batch_size, diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py index a81090e6..4c4482f7 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py @@ -1,7 +1,7 @@ source_vocab_file = 'data/iwslt14_en-fr/vocab.en' target_vocab_file = 'data/iwslt14_en-fr/vocab.fr' -batch_size = 160 +batch_size = 80 train = { 'batch_size': batch_size, diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 125b1fc6..16dba9b9 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -1,8 +1,8 @@ # Attentional Seq2seq model. # Hyperparameters not specified here will take the default values. -num_units = 256 -embedding_dim = 256 +num_units = 1000 +embedding_dim = 500 embedder = { 'dim': embedding_dim @@ -13,7 +13,15 @@ 'kwargs': { 'num_units': num_units }, + 'num_layers': 2 }, + 'output_layer_fw': { + 'dropout_rate': 0 + } +} + +connector = { + 'activation_fn': 'tanh' } decoder = { diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index bb80f0ca..19bfdfa7 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,11 +1,11 @@ max_epochs = 1000 -steps_per_eval = int(1e9) +steps_per_eval = 500 tau = 1. -infer_beam_width = 10 +infer_beam_width = 1 infer_max_decoding_length = 50 -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 25000 +mask_patterns = [(2, 2), (4, 2), (1, 0)] +threshold_steps = int(1e9) minimum_interval_steps = 10000 train_xe = { diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index c4de2695..053a53e7 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -72,7 +72,8 @@ def build_model(batch, train_data): encoder = tx.modules.BidirectionalRNNEncoder( hparams=config_model.encoder) - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) + enc_outputs, enc_final_state = encoder( + source_embedder(batch['source_text_ids'])) target_embedder = tx.modules.WordEmbedder( vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) @@ -83,9 +84,23 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) + enc_final_state = tf.contrib.framework.nest.map_structure( + lambda *args: tf.concat(args, -1), *enc_final_state) + + if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): + connector = tx.modules.MLPTransformConnector( + decoder.state_size.h, hparams=config_model.connector) + dec_initial_h = connector(enc_final_state.h) + dec_initial_state = (dec_initial_h, enc_final_state.c) + else: + connector = tx.modules.MLPTransformConnector( + decoder.state_size, hparams=config_model.connector) + dec_initial_state = connector(enc_final_state) + # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( decoding_strategy='train_greedy', + initial_state=dec_initial_state, 
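PATCH 51's connector fix concerns turning the bidirectional encoder's two final LSTM states into an initial state for the decoder. One common way to do this, shown as a sketch rather than the example's exact code (which routes only the LSTM `h` component through an `MLPTransformConnector`), is to concatenate the forward/backward states feature-wise and project each component to the decoder cell's size:

```
import tensorflow as tf

def decoder_initial_state(enc_final_state, decoder_cell):
    # enc_final_state is a pair of LSTMStateTuples (forward, backward).
    fw, bw = enc_final_state
    merged = tf.contrib.framework.nest.map_structure(
        lambda *tensors: tf.concat(tensors, axis=-1), fw, bw)
    # Project every state component to the size the decoder cell expects.
    return tf.contrib.framework.nest.map_structure(
        lambda t, size: tf.layers.dense(t, size, activation=tf.tanh),
        merged, decoder_cell.state_size)
```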
inputs=target_embedder(batch['target_text_ids'][:, :-1]), sequence_length=batch['target_length']-1) @@ -109,7 +124,8 @@ def build_model(batch, train_data): tau=config_train.tau) tm_outputs, _, _ = decoder( - helper=tm_helper) + helper=tm_helper, + initial_state=dec_initial_state) loss_debleu = tx.losses.debleu( labels=batch['target_text_ids'][:, 1:], @@ -130,6 +146,7 @@ def build_model(batch, train_data): embedding=target_embedder, start_tokens=start_tokens, end_token=end_token, + initial_state=dec_initial_state, beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) @@ -178,6 +195,9 @@ def main(): saver = tf.train.Saver(max_to_keep=None) + global best_val_bleu + best_val_bleu = -1 + def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): print('in _train_epoch') @@ -243,15 +263,30 @@ def _eval_epoch(sess, summary_writer, mode, trigger): summary_writer.add_summary(summary, step) summary_writer.flush() - if trigger is not None: - triggered, _ = trigger(step, bleu) - if triggered: - print('triggered!') + if mode == 'val': + if trigger is not None: + triggered, _ = trigger(step, bleu) + if triggered: + print('triggered!') + + global best_val_bleu + if bleu > best_val_bleu: + best_val_bleu = bleu + print('update best val bleu: {}'.format(best_val_bleu)) + + saved_path = saver.save( + sess, ckpt_best, global_step=step) + + if stage == 'debleu': + with open('{}.trigger'.format(saved_path), 'wb') as \ + pickle_file: + trigger.save_to_pickle(pickle_file) + + print('saved to {}'.format(saved_path)) print('end _eval_epoch') return bleu - best_val_bleu = -1 with tf.Session() as sess: def action_of_mask(mask_pattern): sess.run(train_debleu_op_initializer) @@ -306,20 +341,6 @@ def action_of_mask(mask_pattern): print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( epoch, step, val_bleu, test_bleu)) - if val_bleu > best_val_bleu: - best_val_bleu = val_bleu - print('update best val bleu: {}'.format(best_val_bleu)) - - saved_path = saver.save( - sess, ckpt_best, global_step=step) - - if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'wb') as \ - pickle_file: - trigger.save_to_pickle(pickle_file) - - print('saved to {}'.format(saved_path)) - train_op, summary_op, trigger_ = { 'xe0': (train_xe_op, summary_xe_op, None), 'xe1': (train_xe_op, summary_xe_op, None), From 18da2c7fa739b4c127eed56e0105d2c4cfcec9c8 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Mon, 29 Oct 2018 15:33:48 -0400 Subject: [PATCH 52/65] anneal to bs160 4:2 mask ; reinitialize mask after restoring --- .../config_data_iwslt14_de-en_bs160.py | 45 +++++++++++++++++++ .../config_train_4_2.py | 41 +++++++++++++++++ .../differentiable_expected_bleu.py | 4 ++ 3 files changed, 90 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py create mode 100644 examples/differentiable_expected_bleu/config_train_4_2.py diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py new file mode 100644 index 00000000..cf0c645f --- /dev/null +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py @@ -0,0 +1,45 @@ +source_vocab_file = 'data/iwslt14_de-en/vocab.de' +target_vocab_file = 'data/iwslt14_de-en/vocab.en' + +batch_size = 160 + +train = { + 'batch_size': batch_size, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/train.de', + 'vocab_file': 
source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/train.en', + 'vocab_file': target_vocab_file, + 'max_seq_length': 50 + }, +} + +val = { + 'batch_size': batch_size, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/valid.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/valid.en', + 'vocab_file': target_vocab_file, + }, +} + +test = { + 'batch_size': batch_size, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/test.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/test.en', + 'vocab_file': target_vocab_file, + }, +} diff --git a/examples/differentiable_expected_bleu/config_train_4_2.py b/examples/differentiable_expected_bleu/config_train_4_2.py new file mode 100644 index 00000000..4c8cc276 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_4_2.py @@ -0,0 +1,41 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(4, 2), (1, 0)] +threshold_steps = int(1e9) +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": [1e-3, 1e-5] + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 053a53e7..c1e05851 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -179,6 +179,9 @@ def main(): (train_xe_op, "train_xe_op_initializer"), (train_debleu_op, "train_debleu_op_initializer")]] + tm_helper_initializer = tf.variables_initializer( + [tm_helper.n_unmask, tm_helper.n_mask], name="tm_helper_initializer") + summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] @@ -317,6 +320,7 @@ def action_of_mask(mask_pattern): if reinitialize_optimizer: sess.run(train_xe_op_initializer) sess.run(train_debleu_op_initializer) + sess.run(tm_helper_initializer) trigger_path = '{}.trigger'.format(ckpt_path) if os.path.exists(trigger_path): From 1ac619dfbbbde41eaadef8badff91c8193cb427f Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 29 Oct 2018 23:48:43 -0400 Subject: [PATCH 53/65] add lr1e6_1_0.py config --- .../config_train_lr1e6_1_0.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py diff --git a/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py b/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py new file mode 100644 index 00000000..faad17d7 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py @@ -0,0 +1,41 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. 
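The `mask_patterns` entries such as `(4, 2)` parameterize the teacher mask. Assuming, as the helper's `n_unmask`/`n_mask`/`n_shift` variables suggest, that each cycle feeds `n_unmask` ground-truth tokens followed by `n_mask` model-predicted (soft) tokens, repeated with a random cyclic shift, the mask looks like this small NumPy sketch (illustrative only):

```
import numpy as np

def teacher_mask(length, n_unmask, n_mask, n_shift=0):
    # True  -> feed the ground-truth (teacher) token at this position
    # False -> feed the model's own soft prediction instead
    n_cycle = n_unmask + n_mask
    positions = (np.arange(length) + n_shift) % n_cycle
    return positions < n_unmask

print(teacher_mask(12, 4, 2))
# [ True  True  True  True False False  True  True  True  True False False]
```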
+infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(1, 0)] +threshold_steps = int(1e9) +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": [1e-3, 1e-5] + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-6, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} From c227c28e30705d3444ddada27c4ebb049cacdff4 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Thu, 1 Nov 2018 22:58:13 -0400 Subject: [PATCH 54/65] add more model configs --- ...{config_model.py => config_model_large.py} | 0 .../config_model_medium.py | 40 +++++++++++++++++++ .../differentiable_expected_bleu.py | 34 +++++++++------- 3 files changed, 60 insertions(+), 14 deletions(-) rename examples/differentiable_expected_bleu/{config_model.py => config_model_large.py} (100%) create mode 100644 examples/differentiable_expected_bleu/config_model_medium.py diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model_large.py similarity index 100% rename from examples/differentiable_expected_bleu/config_model.py rename to examples/differentiable_expected_bleu/config_model_large.py diff --git a/examples/differentiable_expected_bleu/config_model_medium.py b/examples/differentiable_expected_bleu/config_model_medium.py new file mode 100644 index 00000000..7750a97c --- /dev/null +++ b/examples/differentiable_expected_bleu/config_model_medium.py @@ -0,0 +1,40 @@ +# Attentional Seq2seq model. +# Hyperparameters not specified here will take the default values. + +num_units = 256 +embedding_dim = 256 +dropout = 0.2 + +embedder = { + 'dim': embedding_dim +} + +encoder = { + 'rnn_cell_fw': { + 'kwargs': { + 'num_units': num_units + }, + 'dropout': { + 'input_keep_prob': 1. - dropout + } + } +} + +connector = None + +decoder = { + 'rnn_cell': { + 'kwargs': { + 'num_units': num_units + }, + 'dropout': { + 'input_keep_prob': 1. - dropout + } + }, + 'attention': { + 'kwargs': { + 'num_units': num_units, + }, + 'attention_layer_size': num_units + } +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index c1e05851..17c5d55b 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,9 +29,8 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train", - "The training config.") -flags.DEFINE_string("config_model", "config_model", "The model config.") +flags.DEFINE_string("config_train", "config_train", "The training config.") +flags.DEFINE_string("config_model", "config_model_medium", "The model config.") flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. 
" @@ -84,18 +83,22 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - enc_final_state = tf.contrib.framework.nest.map_structure( - lambda *args: tf.concat(args, -1), *enc_final_state) + if config_model.connector is None: + dec_initial_state = None - if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): - connector = tx.modules.MLPTransformConnector( - decoder.state_size.h, hparams=config_model.connector) - dec_initial_h = connector(enc_final_state.h) - dec_initial_state = (dec_initial_h, enc_final_state.c) else: - connector = tx.modules.MLPTransformConnector( - decoder.state_size, hparams=config_model.connector) - dec_initial_state = connector(enc_final_state) + enc_final_state = tf.contrib.framework.nest.map_structure( + lambda *args: tf.concat(args, -1), *enc_final_state) + + if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): + connector = tx.modules.MLPTransformConnector( + decoder.state_size.h, hparams=config_model.connector) + dec_initial_h = connector(enc_final_state.h) + dec_initial_state = (dec_initial_h, enc_final_state.c) + else: + connector = tx.modules.MLPTransformConnector( + decoder.state_size, hparams=config_model.connector) + dec_initial_state = connector(enc_final_state) # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( @@ -342,7 +345,7 @@ def action_of_mask(mask_pattern): val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) test_bleu = _eval_epoch(sess, summary_writer, 'test', None) step = tf.train.global_step(sess, global_step) - print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( + print('epoch: {}, step: {}, val BLEU: {}, test BLEU: {}'.format( epoch, step, val_bleu, test_bleu)) train_op, summary_op, trigger_ = { @@ -362,6 +365,9 @@ def action_of_mask(mask_pattern): print('saved to {}'.format(saved_path)) + test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + print('test BLEU: {}'.format(test_bleu)) + if __name__ == '__main__': main() From 316e41c452bb7b1727167f8441c11f7c0c137b8f Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 3 Nov 2018 02:03:13 -0400 Subject: [PATCH 55/65] refine code ; now everything is automatical --- .../config_data_iwslt14_de-en.py | 24 +- .../config_data_iwslt14_de-en_bs160.py | 45 ---- .../config_train.py | 59 ++++- .../config_train_4_2.py | 41 ---- .../config_train_lr1e6_1_0.py | 41 ---- .../differentiable_expected_bleu.py | 219 +++++++++--------- texar/utils/triggers.py | 23 ++ 7 files changed, 203 insertions(+), 249 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py delete mode 100644 examples/differentiable_expected_bleu/config_train_4_2.py delete mode 100644 examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index a3236629..ae3979f5 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,10 +1,23 @@ source_vocab_file = 'data/iwslt14_de-en/vocab.de' target_vocab_file = 'data/iwslt14_de-en/vocab.en' -batch_size = 80 +train_0 = { + 'batch_size': 80, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/train.de', + 'vocab_file': source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/train.en', + 'vocab_file': target_vocab_file, 
+ 'max_seq_length': 50 + }, +} -train = { - 'batch_size': batch_size, +train_1 = { + 'batch_size': 160, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14_de-en/train.de', @@ -18,8 +31,9 @@ }, } + val = { - 'batch_size': batch_size, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/valid.de', @@ -32,7 +46,7 @@ } test = { - 'batch_size': batch_size, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/test.de', diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py deleted file mode 100644 index cf0c645f..00000000 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py +++ /dev/null @@ -1,45 +0,0 @@ -source_vocab_file = 'data/iwslt14_de-en/vocab.de' -target_vocab_file = 'data/iwslt14_de-en/vocab.en' - -batch_size = 160 - -train = { - 'batch_size': batch_size, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": 'data/iwslt14_de-en/train.de', - 'vocab_file': source_vocab_file, - 'max_seq_length': 50 - }, - 'target_dataset': { - 'files': 'data/iwslt14_de-en/train.en', - 'vocab_file': target_vocab_file, - 'max_seq_length': 50 - }, -} - -val = { - 'batch_size': batch_size, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14_de-en/valid.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14_de-en/valid.en', - 'vocab_file': target_vocab_file, - }, -} - -test = { - 'batch_size': batch_size, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14_de-en/test.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14_de-en/test.en', - 'vocab_file': target_vocab_file, - }, -} diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 19bfdfa7..09d3464f 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -4,38 +4,77 @@ infer_beam_width = 1 infer_max_decoding_length = 50 -mask_patterns = [(2, 2), (4, 2), (1, 0)] -threshold_steps = int(1e9) +threshold_steps = 10000 minimum_interval_steps = 10000 +phases = [ + # (config_data, config_train, mask_pattern) + ("train_0", "xe_0", None), + ("train_0", "xe_1", None), + ("train_0", "debleu_0", (2, 2)), + ("train_1", "debleu_0", (4, 2)), + ("train_1", "debleu_1", (1, 0)), +] -train_xe = { +train_xe_0 = { "optimizer": { "type": "AdamOptimizer", "kwargs": { - "learning_rate": [1e-3, 1e-5] + "learning_rate": 1e-3 } }, "gradient_clip": { "type": "clip_by_global_norm", "kwargs": { "clip_norm": 5. - }, + } + }, + "name": "XE_0" +} + +train_xe_1 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + } }, - "name": "XE" + "name": "XE_1" } -train_debleu = { +train_debleu_0 = { "optimizer": { "type": "AdamOptimizer", "kwargs": { - "learning_rate": 1e-5, + "learning_rate": 1e-5 } }, "gradient_clip": { "type": "clip_by_global_norm", "kwargs": { "clip_norm": 5. - }, + } + }, + "name": "DEBLEU_0" +} + +train_debleu_1 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-6 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ } }, - "name": "DEBLEU" + "name": "DEBLEU_1" } diff --git a/examples/differentiable_expected_bleu/config_train_4_2.py b/examples/differentiable_expected_bleu/config_train_4_2.py deleted file mode 100644 index 4c8cc276..00000000 --- a/examples/differentiable_expected_bleu/config_train_4_2.py +++ /dev/null @@ -1,41 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(4, 2), (1, 0)] -threshold_steps = int(1e9) -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": [1e-3, 1e-5] - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py b/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py deleted file mode 100644 index faad17d7..00000000 --- a/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py +++ /dev/null @@ -1,41 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(1, 0)] -threshold_steps = int(1e9) -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": [1e-3, 1e-5] - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-6, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 17c5d55b..a45e2b83 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,34 +29,26 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model_medium", "The model config.") flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") +flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. 
" - "Also used as the directory name of run.") -flags.DEFINE_integer("pretrain_epochs", 10000, "Number of pretraining epochs.") -flags.DEFINE_string("stage", "xe0", "stage.") -flags.DEFINE_boolean("reinitialize_optimizer", False, "Whether to reinitialize " - "optimizer state before training.") + "Used as the directory name of run.") +flags.DEFINE_boolean("reinitialize", True, "Whether to reinitialize the state " + "of the optimizers before training and after triggering.") FLAGS = flags.FLAGS -config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) +config_train = importlib.import_module(FLAGS.config_train) expr_name = FLAGS.expr_name -pretrain_epochs = FLAGS.pretrain_epochs -stage = FLAGS.stage -reinitialize_optimizer = FLAGS.reinitialize_optimizer -mask_patterns = config_train.mask_patterns - -d = config_train.train_xe["optimizer"]["kwargs"] -if stage.startswith("xe"): - d["learning_rate"] = d["learning_rate"][int(stage[2:])] -else: - d["learning_rate"] = d["learning_rate"][-1] +reinitialize = FLAGS.reinitialize +phases = config_train.phases +xe_names = ('xe_0', 'xe_1') +debleu_names = ('debleu_0', 'debleu_1') def get_scope_by_name(tensor): return tensor.name[: tensor.name.rfind('/') + 1] @@ -65,6 +57,8 @@ def get_scope_by_name(tensor): def build_model(batch, train_data): """Assembles the seq2seq model. """ + train_ops = {} + source_embedder = tx.modules.WordEmbedder( vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) @@ -112,9 +106,12 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - train_xe_op = tx.core.get_train_op( + train_ops[xe_names[0]] = tx.core.get_train_op( + loss_xe, + hparams=config_train.train_xe_0) + train_ops[xe_names[1]] = tx.core.get_train_op( loss_xe, - hparams=config_train.train_xe) + hparams=config_train.train_xe_1) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -122,8 +119,8 @@ def build_model(batch, train_data): inputs=batch['target_text_ids'], sequence_length=batch['target_length']-1, embedding=target_embedder, - n_unmask=mask_patterns[0][0], - n_mask=mask_patterns[0][1], + n_unmask=1, + n_mask=0, tau=config_train.tau) tm_outputs, _, _ = decoder( @@ -135,9 +132,12 @@ def build_model(batch, train_data): probs=tm_outputs.sample_id, sequence_length=batch['target_length']-1) - train_debleu_op = tx.core.get_train_op( + train_ops[debleu_names[0]] = tx.core.get_train_op( loss_debleu, - hparams=config_train.train_debleu) + hparams=config_train.train_debleu_0) + train_ops[debleu_names[1]] = tx.core.get_train_op( + loss_debleu, + hparams=config_train.train_debleu_1) # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -153,64 +153,78 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe_op, train_debleu_op, tm_helper, bs_outputs + return train_ops, tm_helper, bs_outputs def main(): """Entrypoint. 
""" - train_data = tx.data.PairedTextData(hparams=config_data.train) + train_0_data = tx.data.PairedTextData(hparams=config_data.train_0) + train_1_data = tx.data.PairedTextData(hparams=config_data.train_1) val_data = tx.data.PairedTextData(hparams=config_data.val) test_data = tx.data.PairedTextData(hparams=config_data.test) data_iterator = tx.data.FeedableDataIterator( - {'train': train_data, 'val': val_data, 'test': test_data}) - + {'train_0': train_0_data, 'train_1': train_1_data, + 'val': val_data, 'test': test_data}) data_batch = data_iterator.get_next() global_step = tf.train.create_global_step() - train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ - build_model(data_batch, train_data) + train_ops, tm_helper, infer_outputs = build_model(data_batch, train_0_data) + + def get_train_op_scope(name): + return get_scope_by_name(train_ops[name]) - train_xe_op_initializer, train_debleu_op_initializer = [ - tf.variables_initializer( + train_op_initializers = { + name: tf.variables_initializer( tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, - scope=get_scope_by_name(train_op)), - name=name) - for train_op, name in [ - (train_xe_op, "train_xe_op_initializer"), - (train_debleu_op, "train_debleu_op_initializer")]] - + scope=get_train_op_scope(name)), + name='train_{}_op_initializer'.format(name)) + for name in (xe_names + debleu_names)} tm_helper_initializer = tf.variables_initializer( [tm_helper.n_unmask, tm_helper.n_mask], name="tm_helper_initializer") summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] - summary_xe_op = tf.summary.merge( - tf.get_collection( - tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe_op)), - name='summary_xe') - summary_debleu_op = tf.summary.merge( - tf.get_collection( - tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_debleu_op)) + summary_tm, - name='summary_debleu') + summary_ops = { + name: tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope=get_train_op_scope(name)) + + (summary_tm if name in debleu_names else []), + name='summary_{}'.format(name)) + for name in (xe_names + debleu_names)} saver = tf.train.Saver(max_to_keep=None) - global best_val_bleu - best_val_bleu = -1 + def _restore_from(directory, restore_trigger): + if os.path.exists(directory): + ckpt_path = tf.train.latest_checkpoint(directory) + print('restoring from {} ...'.format(ckpt_path)) + saver.restore(sess, ckpt_path) + + if restore_trigger: + trigger_path = '{}.trigger'.format(ckpt_path) + if os.path.exists(trigger_path): + with open(trigger_path, 'rb') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous trigger state.') + + print('done.') - def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): + else: + print('cannot find checkpoint directory {}'.format(directory)) + + def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): print('in _train_epoch') - data_iterator.restart_dataset(sess, 'train') + data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, - data_iterator.handle: data_iterator.get_handle(sess, 'train') + data_iterator.handle: data_iterator.get_handle(sess, mode) } while True: @@ -221,7 +235,10 @@ def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): summary_writer.add_summary(summary, step) if step % config_train.steps_per_eval == 0: + global triggered _eval_epoch(sess, summary_writer, 'val', trigger) + if triggered: + break 
except tf.errors.OutOfRangeError: break @@ -271,41 +288,25 @@ def _eval_epoch(sess, summary_writer, mode, trigger): if mode == 'val': if trigger is not None: - triggered, _ = trigger(step, bleu) - if triggered: - print('triggered!') - - global best_val_bleu - if bleu > best_val_bleu: - best_val_bleu = bleu - print('update best val bleu: {}'.format(best_val_bleu)) - - saved_path = saver.save( - sess, ckpt_best, global_step=step) + if (trigger.best_ever_score is not None and + bleu > trigger.best_ever_score): + print('update best val bleu: {}'.format(bleu)) - if stage == 'debleu': + saved_path = saver.save(sess, ckpt_best, global_step=step) with open('{}.trigger'.format(saved_path), 'wb') as \ pickle_file: trigger.save_to_pickle(pickle_file) + print('saved to {}'.format(saved_path)) - print('saved to {}'.format(saved_path)) + global triggered + triggered, _ = trigger(step, bleu) + if triggered: + print('triggered!') print('end _eval_epoch') return bleu with tf.Session() as sess: - def action_of_mask(mask_pattern): - sess.run(train_debleu_op_initializer) - tm_helper.assign_mask_pattern(sess, *mask_pattern) - - action = (action_of_mask(mask_pattern) - for mask_pattern in mask_patterns[1:]) - trigger = tx.utils.BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) - sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) @@ -315,58 +316,62 @@ def action_of_mask(mask_pattern): ckpt_model = os.path.join(dir_model, 'model.ckpt') ckpt_best = os.path.join(dir_best, 'model.ckpt') - if os.path.exists(dir_model): - ckpt_path = tf.train.latest_checkpoint(dir_model) - print('restoring from {} ...'.format(ckpt_path)) - saver.restore(sess, ckpt_path) - - if reinitialize_optimizer: - sess.run(train_xe_op_initializer) - sess.run(train_debleu_op_initializer) - sess.run(tm_helper_initializer) - - trigger_path = '{}.trigger'.format(ckpt_path) - if os.path.exists(trigger_path): - with open(trigger_path, 'rb') as pickle_file: - trigger.restore_from_pickle(pickle_file) - else: - print('cannot find previous trigger state.') + def action_before_phase(phase): + global train_data_name, train_op_name, mask_pattern,\ + train_op, summary_op + train_data_name, train_op_name, mask_pattern = phase + train_op = train_ops[train_op_name] + summary_op = summary_ops[train_op_name] + if reinitialize: + sess.run(train_op_initializers[train_op_name]) + if mask_pattern is not None: + tm_helper.assign_mask_pattern(sess, *mask_pattern) + + action = (action_before_phase(phase) for phase in phases) + next(action) + trigger = tx.utils.BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.minimum_interval_steps, + default=None) - print('done.') + _restore_from(dir_model, restore_trigger=True) summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) epoch = 0 while epoch < config_train.max_epochs: - print('epoch #{}{}:'.format( - epoch, ' ({})'.format(stage))) + print('epoch #{} {}:'.format( + epoch, (train_data_name, train_op_name, mask_pattern))) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + if triggered: + _restore_from(dir_best, restore_trigger=False) + test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + step = tf.train.global_step(sess, global_step) + print('epoch: {}, step: {}, val BLEU: {}, test BLEU: {}'.format( epoch, step, val_bleu, test_bleu)) - train_op, summary_op, trigger_ = { - 'xe0': 
(train_xe_op, summary_xe_op, None), - 'xe1': (train_xe_op, summary_xe_op, None), - 'debleu': (train_debleu_op, summary_debleu_op, trigger) - }[stage] - _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) + _train_epoch(sess, summary_writer, train_data_name, + train_op, summary_op, trigger) + if triggered: + _restore_from(dir_best, restore_trigger=False) + epoch += 1 step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) - - if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: - trigger.save_to_pickle(pickle_file) + with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: + trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) test_bleu = _eval_epoch(sess, summary_writer, 'test', None) - print('test BLEU: {}'.format(test_bleu)) + print('epoch: {}, test BLEU: {}'.format(epoch, test_bleu)) if __name__ == '__main__': diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index ce3c7183..ba1ab0b5 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -209,6 +209,11 @@ def __call__(self, step, score): step (int): Current training step to update. The training step must be updated in ascending order. score (float): Current value of the maintained metric. + + Returns: + A tuple `(triggered, retval)`, where boolean `triggered` denotes + whether triggered this time and `retval` is the return value of the + action performed this time. """ return super(BestEverConvergenceTrigger, self).__call__(step, score) @@ -217,6 +222,24 @@ def _state_names(self): return super(BestEverConvergenceTrigger, self)._state_names + [ '_last_triggered_step', '_best_ever_step', '_best_ever_score'] + @property + def last_triggered_step(self): + """The step at which the Trigger last triggered. + """ + return self._last_triggered_step + + @property + def best_ever_step(self): + """The step at which the best-ever score is reached. + """ + return self._best_ever_step + + @property + def best_ever_score(self): + """The best-ever score. 
+ """ + return self._best_ever_score + class MovingAverageConvergenceTrigger(Trigger): From 0f157e89abcd334c79e8952e378966e743118924 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 3 Nov 2018 23:12:03 -0400 Subject: [PATCH 56/65] make mask pattern Tensors and use placeholder --- .../differentiable_expected_bleu.py | 19 ++++++++++--------- texar/modules/decoders/rnn_decoder_helpers.py | 16 ++-------------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index a45e2b83..e39ea949 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -114,13 +114,15 @@ def build_model(batch, train_data): hparams=config_train.train_xe_1) # teacher mask + DEBLEU fine-tuning + n_unmask = tf.placeholder(tf.int32, shape=[], name="n_unmask") + n_mask = tf.placeholder(tf.int32, shape=[], name="n_mask") tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( # must not remove last token, since it may be used as mask inputs=batch['target_text_ids'], sequence_length=batch['target_length']-1, embedding=target_embedder, - n_unmask=1, - n_mask=0, + n_unmask=n_unmask, + n_mask=n_mask, tau=config_train.tau) tm_outputs, _, _ = decoder( @@ -153,7 +155,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_ops, tm_helper, bs_outputs + return train_ops, tm_helper, (n_unmask, n_mask), bs_outputs def main(): @@ -170,7 +172,8 @@ def main(): global_step = tf.train.create_global_step() - train_ops, tm_helper, infer_outputs = build_model(data_batch, train_0_data) + train_ops, tm_helper, mask_pattern_, infer_outputs = build_model( + data_batch, train_0_data) def get_train_op_scope(name): return get_scope_by_name(train_ops[name]) @@ -182,8 +185,6 @@ def get_train_op_scope(name): scope=get_train_op_scope(name)), name='train_{}_op_initializer'.format(name)) for name in (xe_names + debleu_names)} - tm_helper_initializer = tf.variables_initializer( - [tm_helper.n_unmask, tm_helper.n_mask], name="tm_helper_initializer") summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), @@ -224,7 +225,9 @@ def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, - data_iterator.handle: data_iterator.get_handle(sess, mode) + data_iterator.handle: data_iterator.get_handle(sess, mode), + mask_pattern_[0]: mask_pattern[0], + mask_pattern_[1]: mask_pattern[1], } while True: @@ -324,8 +327,6 @@ def action_before_phase(phase): summary_op = summary_ops[train_op_name] if reinitialize: sess.run(train_op_initializers[train_op_name]) - if mask_pattern is not None: - tm_helper.assign_mask_pattern(sess, *mask_pattern) action = (action_before_phase(phase) for phase in phases) next(action) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 3ff9d419..4af4f874 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -410,17 +410,10 @@ def __init__(self, inputs, sequence_length, embedding, n_unmask, self._zero_next_inputs = tf.zeros_like( self._embedding_fn(self._zero_inputs)) - self._n_unmask = tf.get_variable( - "n_unmask", initializer=n_unmask, 
trainable=False) - self._n_mask = tf.get_variable( - "n_mask", initializer=n_mask, trainable=False) + self._n_unmask = n_unmask + self._n_mask = n_mask self._n_cycle = tf.add( self._n_unmask, self._n_mask, name="n_cycle") - self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) - self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) - self._assign_n_unmask = tf.assign( - self._n_unmask, self._new_n_unmask) - self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) self._n_shift = tf.random_uniform( [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, seed=self._seed, name="n_shift") @@ -441,11 +434,6 @@ def n_unmask(self): def n_mask(self): return self._n_mask - def assign_mask_pattern(self, sess, n_unmask, n_mask): - sess.run([self._assign_n_unmask, self._assign_n_mask], - feed_dict={self._new_n_unmask: n_unmask, - self._new_n_mask: n_mask}) - def _is_masked(self, time): return (time + self._n_shift) % self._n_cycle < self._n_mask From c4c428897e2e587f86db1c59788cdc587b58c219 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sun, 4 Nov 2018 15:29:01 -0500 Subject: [PATCH 57/65] reconstruct triggers ; modify code --- .../differentiable_expected_bleu.py | 132 ++++++++++-------- texar/utils/triggers.py | 118 ++++++++-------- 2 files changed, 138 insertions(+), 112 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index e39ea949..6c567fe3 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Attentional Seq2seq. +"""DEBLEU. 
""" from __future__ import absolute_import from __future__ import print_function @@ -50,6 +50,12 @@ xe_names = ('xe_0', 'xe_1') debleu_names = ('debleu_0', 'debleu_1') +dir_model = os.path.join(expr_name, 'ckpt') +dir_best = os.path.join(expr_name, 'ckpt-best') +ckpt_model = os.path.join(dir_model, 'model.ckpt') +ckpt_best = os.path.join(dir_best, 'model.ckpt') + + def get_scope_by_name(tensor): return tensor.name[: tensor.name.rfind('/') + 1] @@ -198,37 +204,60 @@ def get_train_op_scope(name): name='summary_{}'.format(name)) for name in (xe_names + debleu_names)} + global convergence_trigger + convergence_trigger = tx.utils.BestEverConvergenceTrigger( + None, + lambda state: state, + config_train.threshold_steps, + config_train.minimum_interval_steps) + saver = tf.train.Saver(max_to_keep=None) - def _restore_from(directory, restore_trigger): + def _save_to(directory, step): + print('saving to {} ...'.format(directory)) + saved_path = saver.save(sess, directory, global_step=step) + + for trigger_name in ['convergence_trigger', 'annealing_trigger']: + trigger = globals()[trigger_name] + trigger_path = '{}.{}'.format(saved_path, trigger_name) + print('saving {} ...'.format(trigger_name)) + with open(trigger_path, 'wb') as pickle_file: + trigger.save_to_pickle(pickle_file) + + print('saved to {}'.format(saved_path)) + + def _restore_from(directory, restore_trigger_names): if os.path.exists(directory): ckpt_path = tf.train.latest_checkpoint(directory) print('restoring from {} ...'.format(ckpt_path)) saver.restore(sess, ckpt_path) - if restore_trigger: - trigger_path = '{}.trigger'.format(ckpt_path) + for trigger_name in restore_trigger_names: + trigger = globals()[trigger_name] + trigger_path = '{}.{}'.format(ckpt_path, trigger_name) if os.path.exists(trigger_path): + print('restoring {} ...'.format(trigger_name)) with open(trigger_path, 'rb') as pickle_file: trigger.restore_from_pickle(pickle_file) else: - print('cannot find previous trigger state.') + print('cannot find previous {} state.'.format(trigger_name)) print('done.') else: print('cannot find checkpoint directory {}'.format(directory)) - def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): + def _train_epoch(sess, summary_writer, mode, train_op, summary_op): print('in _train_epoch') data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, data_iterator.handle: data_iterator.get_handle(sess, mode), - mask_pattern_[0]: mask_pattern[0], - mask_pattern_[1]: mask_pattern[1], } + if mask_pattern is not None: + feed_dict.update( + {mask_pattern_[_]: mask_pattern[_] for _ in range(2)}) while True: try: @@ -239,7 +268,7 @@ def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): if step % config_train.steps_per_eval == 0: global triggered - _eval_epoch(sess, summary_writer, 'val', trigger) + _eval_epoch(sess, summary_writer, 'val') if triggered: break @@ -248,7 +277,7 @@ def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): print('end _train_epoch') - def _eval_epoch(sess, summary_writer, mode, trigger): + def _eval_epoch(sess, summary_writer, mode): print('in _eval_epoch with mode {}'.format(mode)) data_iterator.restart_dataset(sess, mode) @@ -290,21 +319,16 @@ def _eval_epoch(sess, summary_writer, mode, trigger): summary_writer.flush() if mode == 'val': - if trigger is not None: - if (trigger.best_ever_score is not None and - bleu > trigger.best_ever_score): - print('update best val bleu: {}'.format(bleu)) - - saved_path = 
saver.save(sess, ckpt_best, global_step=step) - with open('{}.trigger'.format(saved_path), 'wb') as \ - pickle_file: - trigger.save_to_pickle(pickle_file) - print('saved to {}'.format(saved_path)) - - global triggered - triggered, _ = trigger(step, bleu) - if triggered: - print('triggered!') + global triggered + triggered = convergence_trigger(step, bleu) + if triggered: + print('triggered!') + + if convergence_trigger.best_ever_step == step: + print('updated best val bleu: {}'.format( + convergence_trigger.best_ever_score)) + + _save_to(ckpt_best, step) print('end _eval_epoch') return bleu @@ -314,43 +338,42 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - dir_model = os.path.join(expr_name, 'ckpt') - dir_best = os.path.join(expr_name, 'ckpt-best') - ckpt_model = os.path.join(dir_model, 'model.ckpt') - ckpt_best = os.path.join(dir_best, 'model.ckpt') - - def action_before_phase(phase): - global train_data_name, train_op_name, mask_pattern,\ - train_op, summary_op - train_data_name, train_op_name, mask_pattern = phase - train_op = train_ops[train_op_name] - summary_op = summary_ops[train_op_name] + def action(i): + if i >= len(phases): + return i + i += 1 + train_data_name, train_op_name, mask_pattern = phases[i] if reinitialize: sess.run(train_op_initializers[train_op_name]) + return i - action = (action_before_phase(phase) for phase in phases) - next(action) - trigger = tx.utils.BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) + global annealing_trigger + annealing_trigger = tx.utils.Trigger(0, action) - _restore_from(dir_model, restore_trigger=True) + def _restore_and_anneal(): + _restore_from(dir_best, ['convergence_trigger']) + annealing_trigger.trigger() + + _restore_from(dir_model, ['convergence_trigger', 'annealing_trigger']) summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) epoch = 0 while epoch < config_train.max_epochs: + train_data_name, train_op_name, mask_pattern = phases[ + annealing_trigger.user_state] + train_op = train_ops[train_op_name] + summary_op = summary_ops[train_op_name] + print('epoch #{} {}:'.format( epoch, (train_data_name, train_op_name, mask_pattern))) - val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + val_bleu = _eval_epoch(sess, summary_writer, 'val') + test_bleu = _eval_epoch(sess, summary_writer, 'test') if triggered: - _restore_from(dir_best, restore_trigger=False) - - test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + _restore_and_anneal() + continue step = tf.train.global_step(sess, global_step) @@ -358,20 +381,17 @@ def action_before_phase(phase): epoch, step, val_bleu, test_bleu)) _train_epoch(sess, summary_writer, train_data_name, - train_op, summary_op, trigger) + train_op, summary_op) if triggered: - _restore_from(dir_best, restore_trigger=False) + _restore_and_anneal() + continue epoch += 1 step = tf.train.global_step(sess, global_step) - saved_path = saver.save(sess, ckpt_model, global_step=step) - with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: - trigger.save_to_pickle(pickle_file) - - print('saved to {}'.format(saved_path)) + _save_to(ckpt_model, step) - test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + test_bleu = _eval_epoch(sess, summary_writer, 'test') print('epoch: {}, test BLEU: {}'.format(epoch, test_bleu)) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index 
ba1ab0b5..a4128ba2 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Attentional Seq2seq. +"""Triggers. """ from __future__ import absolute_import from __future__ import print_function @@ -33,48 +33,52 @@ ] -DEFAULT_ACTION = object() - - class Trigger(object): - """This is the base class of all triggers. A trigger can do some action when - certain condition is met. Specifically, the user calls the trigger - periodically. Every time the trigger is called, it will send all arguments - to :meth:`_predicate`, which returns a boolean value indicates whether the - condition is met. Once the condition is met, the trigger will then call - `next(action)` to do next action and obtain the returned value. + """This is the base class of all triggers. A trigger maintains some + user-defined :attr:`user_state` and does some :attr:`action` when certain + condition is met. Specifically, the user calls the trigger periodically. + Every time the trigger is called, it will send all arguments to + :meth:`_predicate`, which returns a boolean value indicates whether the + condition is met. Once the condition is met, the trigger will then execute + `user_state = action(user_state)` to update the :attr:`user_state`. + :attr:`user_state` should completely define the current state of the + trigger, and, therefore, enables saving and restoring :attr:`user_state`. + It is the user's responsibility to keep :attr:`action` away from any + possible corruption of restored state. Args: - action (iterable): An iterable which iteratively does the action and - possibly returns a value. - default (optional): The value returned after :attr:`action` exhausted. - If not provided, the trigger will do nothing when `StopIteration` - occurs. + initial_user_state: A (any kind of picklable) object representing the + initial :attr:`user_state`. + action (function): A function which is called to update + :attr:`user_state` every time the trigger is triggered. See above + for detailed explanation. + .. document private functions + .. automethod:: __call__ """ - def __init__(self, action, default=DEFAULT_ACTION): - self._action = iter(action) - self._default = default - self._triggered_times = 0 + def __init__(self, initial_user_state, action): + self._user_state = initial_user_state + self._action = action def _predicate(self, *args, **kwargs): - """This function returns True when the condition is met and we should - do something. + """Returns True when the condition is met and we should do something. """ raise NotImplementedError - def _next_action(self): - return next(self._action) if self._default is DEFAULT_ACTION else \ - next(self._action, self._default) + def trigger(self): + """Executes `user_state = action(user_state)`. User can manually call + this method to trigger it. + """ + self._user_state = self._action(self._user_state) def __call__(self, *args, **kwargs): + """The trigger must be called to update the internal state and + automatically triggers when the condition is found met. 
+ """ pred = self._predicate(*args, **kwargs) if pred: - ret = self._next_action() - self._triggered_times += 1 - else: - ret = None - return pred, ret + self.trigger() + return pred def _make_state(self, names): return {name: getattr(self, name) for name in names} @@ -84,31 +88,31 @@ def _state_names(self): """Returns a list of names of attributes of the trigger object that can be saved and restored as trigger state. """ - return ['_triggered_times'] + return ['_user_state'] @property def state(self): """The current state which can be used to save and restore the trigger. - The state records how many times `next(action)` has been called. + The state is consisted of the internal state used to determine whether + the condition is met, and the user-defined :attr:`user_state`. """ return self._make_state(self._state_names) + @property + def user_state(self): + """The user-defined :attr:`user_state`. + """ + return self._user_state + def restore_from_state(self, state): - """Restore the trigger state from the previous stored state. - Note that this function will call `next(action)` for the exact times - that the :py:attr:`state` records how many times `next(action)` had - been called. The user should be aware of any possible side effect of - this behavior. + """Restore the trigger state from the previous saved state. Args: - state: The state previously obtained by :py:attr:`state`. + state: The state previously obtained by :attr:`state`. """ for name, value in state.items(): setattr(self, name, value) - for t in range(self._triggered_times): - self._next_action() - def save_to_pickle(self, file): """Write a pickled representation of the state of the trigger to the open file-like object :attr:`file`. @@ -123,10 +127,6 @@ def save_to_pickle(self, file): def restore_from_pickle(self, file): """Read a string from the open file-like object :attr:`file` and restore the trigger state from it. - Note that this function will call `next(action)` for the exact times - that the :py:attr:`state` records how many times `next(action)` had - been called. The user should be aware of any possible side effect of - this behavior. Args: file: The open file-like object from which we read. As described in @@ -138,12 +138,14 @@ def restore_from_pickle(self, file): class ScheduledStepsTrigger(Trigger): + """A trigger that triggers at designated steps. + """ - def __init__(self, action, steps, default=DEFAULT_ACTION): - """steps should be in increasing order. + def __init__(self, initial_user_state, action, steps): + """steps should be a list or tuple in increasing order. """ - super(ScheduledTrigger, self).__init__(action, default) - self._steps = iter(steps) + super(ScheduledTrigger, self).__init__(initial_user_state, action) + self._steps = steps self._advance_steps() def _advance_steps(self): @@ -165,8 +167,10 @@ class BestEverConvergenceTrigger(Trigger): triggers. Args: - action (iterable): An iterable which iteratively does the action and - possibly returns a value. + initial_user_state: A (any kind of picklable) object representing the + initial :attr:`user_state`. + action (function): A function which is called to update + :attr:`user_state` every time the trigger is triggered. threshold_steps (int): Number of steps it should trigger after the best value was last updated. minimum_interval_steps (int): Minimum number of steps between twice @@ -178,9 +182,10 @@ class BestEverConvergenceTrigger(Trigger): .. 
automethod:: __call__ """ - def __init__(self, action, threshold_steps, minimum_interval_steps, - default=DEFAULT_ACTION): - super(BestEverConvergenceTrigger, self).__init__(action, default) + def __init__(self, initial_user_state, action, threshold_steps, + minimum_interval_steps): + super(BestEverConvergenceTrigger, self).__init__( + initial_user_state, action) self._threshold_steps = threshold_steps self._minimum_interval_steps = minimum_interval_steps self._last_triggered_step = None @@ -243,9 +248,10 @@ def best_ever_score(self): class MovingAverageConvergenceTrigger(Trigger): - def __init__(self, action, n, threshold, minimum_interval_steps, - default=DEFAULT_ACTION): - super(MovingAverageConvergenceTrigger, self).__init__(action, default) + def __init__(self, initial_user_state, action, n, threshold, + minimum_interval_steps): + super(MovingAverageConvergenceTrigger, self).__init__( + initial_user_state, action) self._n = n self._threshold = threshold self._minimum_interval_steps = minimum_interval_steps From 2b1fe5a325b0028e5043f25ad3348fcc937e9513 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Mon, 5 Nov 2018 03:54:05 +0000 Subject: [PATCH 58/65] add test units for triggers --- texar/utils/triggers_test.py | 68 ++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 texar/utils/triggers_test.py diff --git a/texar/utils/triggers_test.py b/texar/utils/triggers_test.py new file mode 100644 index 00000000..a3f88a20 --- /dev/null +++ b/texar/utils/triggers_test.py @@ -0,0 +1,68 @@ +""" +Unit tests for triggers. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import random + +from texar.utils.triggers import Trigger, BestEverConvergenceTrigger + + +class TriggerTest(tf.test.TestCase): + """Tests :class:`~texar.utils.Trigger`. + """ + + def test(self): + trigger = Trigger(0, lambda x: x+1) + for step in range(100): + trigger.trigger() + self.assertEqual(trigger.user_state, step+1) + +class BestEverConvergenceTriggerTest(tf.test.TestCase): + """Tests :class:`~texar.utils.BestEverConvergenceTrigger`. 
+ """ + + def test(self): + for i in range(100): + n = random.randint(1, 100) + seq = list(range(n)) + random.shuffle(seq) + threshold_steps = random.randint(0, n // 2 + 1) + minimum_interval_steps = random.randint(0, n // 2 + 1) + trigger = BestEverConvergenceTrigger( + 0, lambda x: x+1, threshold_steps, minimum_interval_steps) + + best_ever_step, best_ever_score, last_triggered_step = -1, -1, None + + for step, score in enumerate(seq): + if score > best_ever_score: + best_ever_step = step + best_ever_score = score + + triggered_ = step - best_ever_step >= threshold_steps and \ + (last_triggered_step is None or + step - last_triggered_step >= minimum_interval_steps) + if triggered_: + last_triggered_step = step + + triggered = trigger(step, score) + + self.assertEqual(trigger.best_ever_step, best_ever_step) + self.assertEqual(trigger.best_ever_score, best_ever_score) + self.assertEqual(trigger.last_triggered_step, + last_triggered_step) + self.assertEqual(triggered, triggered_) + + trigger = BestEverConvergenceTrigger(0, lambda x: x+1, 0, 0) + for step in range(100): + trigger.trigger() + self.assertEqual(trigger.user_state, step+1) + + +if __name__ == "__main__": + tf.test.main() + From ec20a9eac5e01fc1582b5845dd60c473dedefdaf Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Mon, 5 Nov 2018 05:49:03 +0000 Subject: [PATCH 59/65] rewrite ScheduledStepsTrigger; correct and refine some docs TODO: 1. test ScheduledStepsTrigger; 2. test docs. --- texar/utils/triggers.py | 79 ++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index a4128ba2..f209f377 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -29,6 +29,7 @@ __all__ = [ "Trigger", + "ScheduledStepsTrigger", "BestEverConvergenceTrigger", ] @@ -58,6 +59,8 @@ class Trigger(object): def __init__(self, initial_user_state, action): self._user_state = initial_user_state + if not callable(action): + raise ValueError("Action {} is not callable".format(action)) self._action = action def _predicate(self, *args, **kwargs): @@ -74,6 +77,9 @@ def trigger(self): def __call__(self, *args, **kwargs): """The trigger must be called to update the internal state and automatically triggers when the condition is found met. + + Returns: + A boolean denotes whether triggered this time. """ pred = self._predicate(*args, **kwargs) if pred: @@ -138,7 +144,31 @@ def restore_from_pickle(self, file): class ScheduledStepsTrigger(Trigger): - """A trigger that triggers at designated steps. + """A trigger that triggers after the training step have iterated over some + user-designated steps. This means that it will trigger if there is at least + one `step` in user-designated set of :attr:`steps` within the range + `(last_called_step, current_step]`. + + Args: + initial_user_state: A (any kind of picklable) object representing the + initial :attr:`user_state`. + action (function): A function which is called to update + :attr:`user_state` every time the trigger is triggered. + steps (list, tuple, or callable): Represents the user-designated set of + :attr:`steps` described above. There are **2 ways** provided to specify + this set: + + 1. :attr:`steps` is a callable. When calling + `steps(last_called_step, current_step)`, it is assumed to return + a boolean indicating whether there is at least one `step` in the set + within the range `(last_called_step, current_step]`. 
For example, + :code:`steps = lambda l, r: l // n != r // n` denotes the set + `{i * n for any positive integer i}` where `n` is some positive + integer. This option enables user to define any set of steps, even + an infinite set. + + 2. :attr:`steps` is a `list` or `tuple` containing numbers in ascending + order. These numbers compose the whole set. """ def __init__(self, initial_user_state, action, steps): @@ -146,17 +176,44 @@ def __init__(self, initial_user_state, action, steps): """ super(ScheduledTrigger, self).__init__(initial_user_state, action) self._steps = steps - self._advance_steps() - def _advance_steps(self): - self._next_step = next(step, None) + if callable(self._steps): + self._last_called_step = None + + else: + self._index = 0 + + @property + def _state_names(self): + return super(ScheduledStepsTrigger, self)._state_names + [ + '_last_called_step' if callable(self._steps) else '_index'] def _predicate(self, step): - while self._next_step is not None and step < self._next_step: - self._advance_steps() - if self._next_step is not None and step == self._next_step: - return True - return False + if callable(self._steps): + ret = self._steps(self._last_called_step, step) + self._last_call_step = step + + else: + ret = False + while self._index < len(self._steps) and \ + self._steps[self._index] <= step: + ret = True + self._index += 1 + + return ret + + def __call__(self, step): + """The trigger must be called to update the current training step + (:attr:`step`). + + Args: + step (int): Current training step to update. The training step must + be updated in ascending order. + + Returns: + A boolean denotes whether triggered this time. + """ + return super(ScheduledStepsTrigger, self).__call__(step) class BestEverConvergenceTrigger(Trigger): @@ -216,9 +273,7 @@ def __call__(self, step, score): score (float): Current value of the maintained metric. Returns: - A tuple `(triggered, retval)`, where boolean `triggered` denotes - whether triggered this time and `retval` is the return value of the - action performed this time. + A boolean denotes whether triggered this time. 
""" return super(BestEverConvergenceTrigger, self).__call__(step, score) From ad56c3effb17bfc8a279afb9e46a7df6df6c571e Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 5 Nov 2018 11:27:09 -0500 Subject: [PATCH 60/65] fix final annealing bug --- .../differentiable_expected_bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 6c567fe3..218e0b3b 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -339,7 +339,7 @@ def _eval_epoch(sess, summary_writer, mode): sess.run(tf.tables_initializer()) def action(i): - if i >= len(phases): + if i >= len(phases) - 1: return i i += 1 train_data_name, train_op_name, mask_pattern = phases[i] From 1f3e21278c21336a1f21cdb533caaa30123d074f Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 5 Nov 2018 14:08:41 -0500 Subject: [PATCH 61/65] add config restore_from --- .../differentiable_expected_bleu.py | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 218e0b3b..ac6573b0 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -35,6 +35,9 @@ flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. " "Used as the directory name of run.") +flags.DEFINE_string("restore_from", "", "The specific checkpoint path to " + "restore from. 
If not specified, the latest checkpoint in " + "expr_name is restored.") flags.DEFINE_boolean("reinitialize", True, "Whether to reinitialize the state " "of the optimizers before training and after triggering.") @@ -44,6 +47,7 @@ config_data = importlib.import_module(FLAGS.config_data) config_train = importlib.import_module(FLAGS.config_train) expr_name = FLAGS.expr_name +restore_from = FLAGS.restore_from reinitialize = FLAGS.reinitialize phases = config_train.phases @@ -226,23 +230,29 @@ def _save_to(directory, step): print('saved to {}'.format(saved_path)) - def _restore_from(directory, restore_trigger_names): - if os.path.exists(directory): - ckpt_path = tf.train.latest_checkpoint(directory) - print('restoring from {} ...'.format(ckpt_path)) - saver.restore(sess, ckpt_path) + def _restore_from_path(ckpt_path, restore_trigger_names=None): + print('restoring from {} ...'.format(ckpt_path)) + saver.restore(sess, ckpt_path) + + if restore_trigger_names is None: + restore_trigger_names = ['convergence_trigger', 'annealing_trigger'] + + for trigger_name in restore_trigger_names: + trigger = globals()[trigger_name] + trigger_path = '{}.{}'.format(ckpt_path, trigger_name) + if os.path.exists(trigger_path): + print('restoring {} ...'.format(trigger_name)) + with open(trigger_path, 'rb') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous {} state.'.format(trigger_name)) - for trigger_name in restore_trigger_names: - trigger = globals()[trigger_name] - trigger_path = '{}.{}'.format(ckpt_path, trigger_name) - if os.path.exists(trigger_path): - print('restoring {} ...'.format(trigger_name)) - with open(trigger_path, 'rb') as pickle_file: - trigger.restore_from_pickle(pickle_file) - else: - print('cannot find previous {} state.'.format(trigger_name)) + print('done.') - print('done.') + def _restore_from(directory, restore_trigger_names=None): + if os.path.exists(directory): + ckpt_path = tf.train.latest_checkpoint(directory) + _restore_from_path(ckpt_path, restore_trigger_names) else: print('cannot find checkpoint directory {}'.format(directory)) @@ -354,7 +364,10 @@ def _restore_and_anneal(): _restore_from(dir_best, ['convergence_trigger']) annealing_trigger.trigger() - _restore_from(dir_model, ['convergence_trigger', 'annealing_trigger']) + if restore_from: + _restore_from_path(restore_from) + else: + _restore_from(dir_model) summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) From 8988209feee68073431ee9fbe63cb857b9cd2ca3 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Tue, 6 Nov 2018 02:10:32 +0000 Subject: [PATCH 62/65] add test units for ScheduledStepsTrigger and fix some bugs --- texar/utils/triggers.py | 23 ++++++++++---- texar/utils/triggers_test.py | 59 +++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 7 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index f209f377..1553393b 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -165,16 +165,17 @@ class ScheduledStepsTrigger(Trigger): :code:`steps = lambda l, r: l // n != r // n` denotes the set `{i * n for any positive integer i}` where `n` is some positive integer. This option enables user to define any set of steps, even - an infinite set. + an infinite set. Note that in this case the trigger will never + trigger when being called for the first time, because + `last_called_step` is undefined at this time. User can manually call + it to specify an initial step before training. 2. 
:attr:`steps` is a `list` or `tuple` containing numbers in ascending order. These numbers compose the whole set. """ def __init__(self, initial_user_state, action, steps): - """steps should be a list or tuple in increasing order. - """ - super(ScheduledTrigger, self).__init__(initial_user_state, action) + super(ScheduledStepsTrigger, self).__init__(initial_user_state, action) self._steps = steps if callable(self._steps): @@ -188,10 +189,20 @@ def _state_names(self): return super(ScheduledStepsTrigger, self)._state_names + [ '_last_called_step' if callable(self._steps) else '_index'] + @property + def last_called_step(self): + """The step when the trigger is latest called. + """ + return self._last_called_step + def _predicate(self, step): if callable(self._steps): - ret = self._steps(self._last_called_step, step) - self._last_call_step = step + if self._last_called_step is not None: + ret = self._steps(self._last_called_step, step) + else: + ret = False + + self._last_called_step = step else: ret = False diff --git a/texar/utils/triggers_test.py b/texar/utils/triggers_test.py index a3f88a20..979b95ed 100644 --- a/texar/utils/triggers_test.py +++ b/texar/utils/triggers_test.py @@ -8,8 +8,9 @@ import tensorflow as tf import random +import bisect -from texar.utils.triggers import Trigger, BestEverConvergenceTrigger +from texar.utils.triggers import * class TriggerTest(tf.test.TestCase): @@ -22,6 +23,62 @@ def test(self): trigger.trigger() self.assertEqual(trigger.user_state, step+1) + +class ScheduledStepsTriggerTest(tf.test.TestCase): + """Tests :class:`~texar.utils.ScheduledStepsTrigger`. + """ + + def test(self): + for i in range(100): + n = random.randint(1, 100) + m = random.randint(1, n) + p = random.uniform(0, 0.3) + f = lambda l, r: l // n != r // n + trigger = ScheduledStepsTrigger(0, lambda x: x+1, f) + + last_called_step = None + + for step in range(n): + if random.random() < p: + if last_called_step is not None: + triggered_ = f(last_called_step, step) + else: + triggered_ = False + + last_called_step = step + + triggered = trigger(step) + + self.assertEqual(trigger.last_called_step, last_called_step) + self.assertEqual(triggered, triggered_) + + for i in range(100): + n = random.randint(1, 100) + m = random.randint(1, n) + p = random.uniform(0, 0.3) + q = random.uniform(0, 0.3) + steps = [step for step in range(n) if random.random() < q] + f = lambda l, r: bisect.bisect_right(steps, l) < \ + bisect.bisect_right(steps, r) + trigger = ScheduledStepsTrigger(0, lambda x: x+1, steps) + + last_called_step = -1 + + for step in range(n): + if random.random() < p: + triggered_ = f(last_called_step, step) + last_called_step = step + + triggered = trigger(step) + + self.assertEqual(triggered, triggered_) + + trigger = ScheduledStepsTrigger(0, lambda x: x+1, []) + for step in range(100): + trigger.trigger() + self.assertEqual(trigger.user_state, step+1) + + class BestEverConvergenceTriggerTest(tf.test.TestCase): """Tests :class:`~texar.utils.BestEverConvergenceTrigger`. """ From 5851220137cc060d83c6357a501cdcff0b47b9a3 Mon Sep 17 00:00:00 2001 From: wwt Date: Mon, 5 Nov 2018 21:37:43 -0500 Subject: [PATCH 63/65] fix docs for triggers --- docs/code/utils.rst | 5 +++++ texar/utils/triggers.py | 48 ++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/docs/code/utils.rst b/docs/code/utils.rst index c463c752..726d4739 100644 --- a/docs/code/utils.rst +++ b/docs/code/utils.rst @@ -287,6 +287,11 @@ Trigger .. 
autoclass:: texar.utils.Trigger :members: +:hidden:`ScheduledStepsTrigger` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.utils.ScheduledStepsTrigger + :members: + :hidden:`BestEverConvergenceTrigger` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: texar.utils.BestEverConvergenceTrigger diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index 1553393b..ee50c341 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -50,7 +50,7 @@ class Trigger(object): Args: initial_user_state: A (any kind of picklable) object representing the initial :attr:`user_state`. - action (function): A function which is called to update + action (callable): A callable which is called to update :attr:`user_state` every time the trigger is triggered. See above for detailed explanation. .. document private functions @@ -149,29 +149,32 @@ class ScheduledStepsTrigger(Trigger): one `step` in user-designated set of :attr:`steps` within the range `(last_called_step, current_step]`. + There are **2 ways** provided to specify the set of :attr:`steps`: + + 1. :attr:`steps` is a callable. When calling + `steps(last_called_step, current_step)`, it is assumed to return + a boolean indicating whether there is at least one `step` in the set + within the range `(last_called_step, current_step]`. For example, + :code:`steps = lambda l, r: l // n != r // n` denotes the set + `{i * n for any integer i}` where `n` is some integer. This option + enables user to define any set of steps, even an infinite set. Note + that in this case the trigger will never trigger when being called + for the first time, because `last_called_step` is undefined at this + time. User can manually call it to specify an initial step before + training. + + 2. :attr:`steps` is a `list` or `tuple` containing numbers in ascending + order. These numbers compose the whole set. + Args: initial_user_state: A (any kind of picklable) object representing the initial :attr:`user_state`. - action (function): A function which is called to update + action (callable): A callable which is called to update :attr:`user_state` every time the trigger is triggered. steps (list, tuple, or callable): Represents the user-designated set of - :attr:`steps` described above. There are **2 ways** provided to specify - this set: - - 1. :attr:`steps` is a callable. When calling - `steps(last_called_step, current_step)`, it is assumed to return - a boolean indicating whether there is at least one `step` in the set - within the range `(last_called_step, current_step]`. For example, - :code:`steps = lambda l, r: l // n != r // n` denotes the set - `{i * n for any positive integer i}` where `n` is some positive - integer. This option enables user to define any set of steps, even - an infinite set. Note that in this case the trigger will never - trigger when being called for the first time, because - `last_called_step` is undefined at this time. User can manually call - it to specify an initial step before training. - - 2. :attr:`steps` is a `list` or `tuple` containing numbers in ascending - order. These numbers compose the whole set. + :attr:`steps` described above. + .. document private functions + .. automethod:: __call__ """ def __init__(self, initial_user_state, action, steps): @@ -237,15 +240,12 @@ class BestEverConvergenceTrigger(Trigger): Args: initial_user_state: A (any kind of picklable) object representing the initial :attr:`user_state`. 
- action (function): A function which is called to update + action (callable): A callable which is called to update :attr:`user_state` every time the trigger is triggered. threshold_steps (int): Number of steps it should trigger after the best value was last updated. minimum_interval_steps (int): Minimum number of steps between twice firing of the trigger. - default (optional): The value returned after :attr:`action` exhausted. - If not provided, the trigger will do nothing when `StopIteration` - occurs. .. document private functions .. automethod:: __call__ """ @@ -281,7 +281,7 @@ def __call__(self, step, score): Args: step (int): Current training step to update. The training step must be updated in ascending order. - score (float): Current value of the maintained metric. + score (float or int): Current value of the maintained metric. Returns: A boolean denotes whether triggered this time. From 8fdf62ee9d7f54c6fe571598225eafc5e7a532ad Mon Sep 17 00:00:00 2001 From: wwt Date: Mon, 5 Nov 2018 21:44:12 -0500 Subject: [PATCH 64/65] remove unfinished MovingAverageConvergenceTrigger --- texar/utils/triggers.py | 42 ----------------------------------------- 1 file changed, 42 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index ee50c341..d4aefdaf 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -310,45 +310,3 @@ def best_ever_score(self): """The best-ever score. """ return self._best_ever_score - - -class MovingAverageConvergenceTrigger(Trigger): - - def __init__(self, initial_user_state, action, n, threshold, - minimum_interval_steps): - super(MovingAverageConvergenceTrigger, self).__init__( - initial_user_state, action) - self._n = n - self._threshold = threshold - self._minimum_interval_steps = minimum_interval_steps - self._last_triggered_step = None - self._head_queue = queue.Queue(self._n) - self._head_sum = 0 - self._rear_queue = queue.Queue(self._n) - self._rear_sum = 0 - - def _predicate(self, step, score): - if self._head_queue.full(): - e = self._head_queue.get() - self._head_sum -= e - if self._rear_queue.full(): - self._rear_sum -= self._rear_queue.get() - self._rear_queue.put(e) - self._rear_sum += e - self._head_queue.put(score) - self._head_sum += score - - if (self._last_triggered_step is None or - step - self._last_triggered_step - >= self._minimum_interval_steps) and \ - self._head_queue.full() and self._rear_queue.full() and \ - self._head_sum - self._rear_sum <= self._n * self._threshold: - self._last_triggered_step = step - return True - return False - - @property - def _state_names(self): - return super(BestEverConvergenceTrigger, self)._state_names + [ - '_last_triggered_step', '_head_queue', '_head_sum', '_rear_queue', - '_rear_sum'] From 3b588830c15d3161d4833a2618aec9d934abe7bf Mon Sep 17 00:00:00 2001 From: wwt Date: Mon, 5 Nov 2018 22:23:25 -0500 Subject: [PATCH 65/65] update README.md --- .../differentiable_expected_bleu/README.md | 27 ++++++++++++++----- .../differentiable_expected_bleu.py | 2 +- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md index ad5d685c..5bff077d 100644 --- a/examples/differentiable_expected_bleu/README.md +++ b/examples/differentiable_expected_bleu/README.md @@ -8,7 +8,7 @@ This example builds an attentional seq2seq model for machine translation trained Download the data with the following cmds: -``` +```bash python prepare_data.py --data de-en ``` @@ -16,18 +16,33 @@ python 
prepare_data.py --data de-en Train the model with the following cmd: -``` -python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14_de-en --config_train config_train_iwslt14_de-en --pretrain_epochs 8 +```bash +python differentiable_expected_bleu.py --config_model config_model_medium --config_data config_data_iwslt14_de-en --config_train config_train --expr_name iwslt14_de-en --restore_from "" --reinitialize ``` Here: * `--config_model` specifies the model config. Note not to include the `.py` suffix. * `--config_data` specifies the data config. * `--config_train` specifies the training config. - * `--pretrain_epochs` specifies the number of epochs to pretrain with cross-entropy loss. + * `--expr_name` specifies the experiment name. Used as the directory name to save and restore all information. + * `--restore_from` specifies the checkpoint path to restore from. If not specified (or an empty string is specified), the latest checkpoint in `expr_name` is restored. + * `--reinitialize` is a flag indicates whether to reinitialize the state of the optimizers before training and after annealing. Default is enabled. + +[config_model_medium.py](./config_model_medium.py) specifies a single-layer seq2seq model with Luong attention and bi-directional RNN encoder. -[config_model.py](./config_model.py) specifies a single-layer seq2seq model with Luong attention and bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file. +[config_model_large.py](./config_model_large.py) specifies a seq2seq model with Luong attention, 2-layer bi-directional RNN encoder, single-layer RNN decoder, and a connector between the final state of the encoder and the initial state of the decoder. The size of this model is quite large. + +[config_data_iwslt14_de-en.py](./config_data_iwslt14_de-en.py) specifies the IWSLT'14 German-English dataset. + +[config_train.py](./config_train.py) specifies the training (including annealing) configs. ## Results ## -On the IWSLT14 dataset, the model achieves `BLEU=25.35` after annealed all masks, while the cross-entropy trained model achieves `BLEU=24.57`. +On the IWSLT'14 German-English dataset, we ran both configs for 4~5 times. Here are the average BLEU scores attained: + +| config | inference beam size | Cross-Entropy baseline | DEBLEU | improvement | +| :------------------------------------------------: | :-----------------: | :--------------------: | :----: | :---------: | +| [config_model_medium.py](./config_model_medium.py) | 1 | 26.12 | 27.40 | 1.28 | +| [config_model_medium.py](./config_model_medium.py) | 5 | 27.03 | 27.72 | 0.70 | +| [config_model_large.py](./config_model_large.py) | 1 | 25.24 | 26.47 | 1.23 | +| [config_model_large.py](./config_model_large.py) | 5 | 26.33 | 26.87 | 0.54 | diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ac6573b0..0c414b21 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -39,7 +39,7 @@ "restore from. If not specified, the latest checkpoint in " "expr_name is restored.") flags.DEFINE_boolean("reinitialize", True, "Whether to reinitialize the state " - "of the optimizers before training and after triggering.") + "of the optimizers before training and after annealing.") FLAGS = flags.FLAGS
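
For reference, a minimal usage sketch of the reworked trigger API (illustrative only, not part of the patches above). It assumes `texar.utils.triggers` as it stands after the "reconstruct triggers" patch: a picklable `user_state`, a callable `action(user_state)`, and `__call__(step, score)` returning a plain boolean. The phase names and numeric values are hypothetical, and the `anneal` action only mirrors, in spirit, the phase-advancing `action(i)` used in `differentiable_expected_bleu.py`.

```python
# Minimal usage sketch of the reworked trigger API (illustrative only).
from texar.utils.triggers import BestEverConvergenceTrigger

# Hypothetical phase names, standing in for the (data, train_op, mask_pattern)
# phases configured in config_train.py.
phases = ['xe_0', 'xe_1', 'debleu_0', 'debleu_1']

def anneal(i):
    # action(user_state) -> new user_state: advance to the next phase index.
    return min(i + 1, len(phases) - 1)

trigger = BestEverConvergenceTrigger(
    initial_user_state=0,       # start in the first phase
    action=anneal,
    threshold_steps=3,          # fire once the best score has stalled for 3 steps
    minimum_interval_steps=2)   # and at least 2 steps since the last firing

val_bleu = [10.0, 12.0, 12.5, 12.4, 12.3, 12.2, 12.1]  # synthetic scores
for step, bleu in enumerate(val_bleu):
    if trigger(step, bleu):     # updates best-ever bookkeeping, triggers if stalled
        print('annealing to phase', phases[trigger.user_state])

print('best BLEU {} at step {}'.format(
    trigger.best_ever_score, trigger.best_ever_step))
```

As in the example script, the trigger state (including `user_state`) can be checkpointed with `save_to_pickle(file)` and recovered with `restore_from_pickle(file)`; after the rework, restoring no longer replays the action.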