From d7e39102c7e634828b0288fec1f87a1e6631a7b9 Mon Sep 17 00:00:00 2001
From: wwt
Date: Sun, 30 Sep 2018 19:47:22 -0400
Subject: [PATCH 01/65] add differentiable_expected_bleu loss

---
 .../differentiable_expected_bleu/README.md | 40 ++++
 .../config_iwslt14.py | 45 +++++
 .../config_model.py | 29 +++
 .../config_model_full.py | 127 +++++++++++++
 .../config_train.py | 1 +
 .../differentiable_expected_bleu.py | 173 ++++++++++++++++++
 .../prepare_data.py | 53 ++++++
 texar/losses/__init__.py | 1 +
 texar/losses/differentiable_expected_bleu.py | 129 +++++++++++++
 9 files changed, 598 insertions(+)
 create mode 100644 examples/differentiable_expected_bleu/README.md
 create mode 100644 examples/differentiable_expected_bleu/config_iwslt14.py
 create mode 100644 examples/differentiable_expected_bleu/config_model.py
 create mode 100644 examples/differentiable_expected_bleu/config_model_full.py
 create mode 100644 examples/differentiable_expected_bleu/config_train.py
 create mode 100755 examples/differentiable_expected_bleu/differentiable_expected_bleu.py
 create mode 100644 examples/differentiable_expected_bleu/prepare_data.py
 create mode 100644 texar/losses/differentiable_expected_bleu.py

diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md
new file mode 100644
index 00000000..9b481c5e
--- /dev/null
+++ b/examples/differentiable_expected_bleu/README.md
@@ -0,0 +1,40 @@
+# Differentiable Expected BLEU #
+
+This example builds an attentional seq2seq model for machine translation and trains it with a differentiable expected BLEU (DEBLEU) loss.
+
+## Usage ##
+
+### Dataset ###
+
+Two example datasets are provided:
+
+ * toy_copy: A small toy autoencoding dataset from [TF Seq2seq toolkit](https://github.com/google/seq2seq/tree/2500c26add91b079ca00cf1f091db5a99ddab9ae).
+ * iwslt14: The benchmark [IWSLT2014](https://sites.google.com/site/iwsltevaluation2014/home) (de-en) machine translation dataset.
+
+Download the data with the following commands:
+
+```
+python prepare_data.py --data toy_copy
+python prepare_data.py --data iwslt14
+```
+
+### Train the model ###
+
+Train the model with the following command:
+
+```
+python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14
+```
+
+Here:
+ * `--config_model` specifies the model config. Note that the `.py` suffix should not be included.
+ * `--config_data` specifies the data config.
+
+[config_model.py](./config_model.py) specifies a single-layer seq2seq model with Luong attention and a bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file.
+
+For demonstration purposes, [config_model_full.py](./config_model_full.py) gives all possible hyperparameters for the model. The two config files will lead to the same model.
+
+## Results ##
+
+On the IWSLT14 dataset, using the original target texts as references (no `<UNK>` in the references), the model achieves `BLEU=21.66` within `10` epochs.
+ diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py new file mode 100644 index 00000000..0c36dc73 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -0,0 +1,45 @@ + +num_epochs = 15 +display = 500 + +source_vocab_file = './data/iwslt14/vocab.de' +target_vocab_file = './data/iwslt14/vocab.en' + +train = { + 'batch_size': 32, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14/train.de', + 'vocab_file': source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14/train.en', + 'vocab_file': target_vocab_file, + 'max_seq_length': 50 + } +} +val = { + 'batch_size': 32, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14/valid.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14/valid.en', + 'vocab_file': target_vocab_file, + } +} +test = { + 'batch_size': 32, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14/test.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14/test.en', + 'vocab_file': target_vocab_file, + } +} diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py new file mode 100644 index 00000000..8ef3c9b3 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_model.py @@ -0,0 +1,29 @@ +# Attentional Seq2seq model. +# Hyperparameters not specified here will take the default values. + +num_units = 256 +beam_width = 10 + +embedder = { + 'dim': num_units +} +encoder = { + 'rnn_cell_fw': { + 'kwargs': { + 'num_units': num_units + } + } +} +decoder = { + 'rnn_cell': { + 'kwargs': { + 'num_units': num_units + }, + }, + 'attention': { + 'kwargs': { + 'num_units': num_units, + }, + 'attention_layer_size': num_units + } +} diff --git a/examples/differentiable_expected_bleu/config_model_full.py b/examples/differentiable_expected_bleu/config_model_full.py new file mode 100644 index 00000000..b59ebc4e --- /dev/null +++ b/examples/differentiable_expected_bleu/config_model_full.py @@ -0,0 +1,127 @@ +# The full possible hyperparameters for the attentional seq2seq model. +# Most of the hyperparameters take the default values and are not necessary to +# specify explicitly. The config here results in the same model with the +# `config_model.py`. + +num_units = 256 +beam_width = 10 + +# --------------------- Embedder --------------------- # +embedder = { + 'dim': num_units, + 'initializer': { + 'type': 'random_uniform_initializer', + 'kwargs': { + 'minval': -0.1, + 'maxval': 0.1, + 'seed': None + }, + }, + 'regularizer': { + 'type': 'L1L2', + 'kwargs': { + 'l1': 0, + 'l2': 0 + } + }, + 'dropout_rate': 0, + 'dropout_strategy': 'element', + 'trainable': True, + 'name': 'word_embedder' +} + +# --------------------- Encoder --------------------- # +encoder = { + 'rnn_cell_fw': { + 'type': 'LSTMCell', + 'kwargs': { + 'num_units': num_units, + 'forget_bias': 1.0, + 'activation': None, + # Other arguments go here for tf.nn.rnn_cell.LSTMCell + # ... + }, + 'num_layers': 1, + 'dropout': { + 'input_keep_prob': 1.0, + 'output_keep_prob': 1.0, + 'state_keep_prob': 1.0, + 'variational_recurrent': False, + 'input_size': [], + }, + 'residual': False, + 'highway': False, + }, + 'rnn_cell_bw': { + # The same possible hyperparameters as with 'rnn_cell_fw' + # ... 
+ }, + 'rnn_cell_share_config': True, + 'output_layer_fw': { + 'num_layers': 0, + 'layer_size': 128, + 'activation': 'identity', + 'final_layer_activation': None, + 'other_dense_kwargs': None, + 'dropout_layer_ids': [], + 'dropout_rate': 0.5, + 'variational_dropout': False + }, + 'output_layer_bw': { + # The same possible hyperparameters as with 'output_layer_fw' + # ... + }, + 'output_layer_share_config': True, + 'name': 'bidirectional_rnn_encoder' +} + +# --------------------- Decoder --------------------- # +decoder = { + 'rnn_cell': { + 'type': 'LSTMCell', + 'kwargs': { + 'num_units': num_units, + 'forget_bias': 1.0, + 'activation': None, + # Other arguments go here for tf.nn.rnn_cell.LSTMCell + # ... + }, + 'num_layers': 1, + 'dropout': { + 'input_keep_prob': 1.0, + 'output_keep_prob': 1.0, + 'state_keep_prob': 1.0, + 'variational_recurrent': False, + 'input_size': [], + }, + 'residual': False, + 'highway': False, + }, + 'attention': { + 'type': 'LuongAttention', + 'kwargs': { + 'num_units': num_units, + 'scale': False, + 'probability_fn': None, + 'score_mask_value': None, + # Other arguments go here for tf.contrib.seq2seq.LuongAttention + # ... + }, + 'attention_layer_size': num_units, + 'alignment_history': False, + 'output_attention': True, + }, + 'helper_train': { + 'type': 'TrainingHelper', + 'kwargs': { + # Arguments go here for tf.contrib.seq2seq.TrainingHelper + } + }, + 'helper_infer': { + # The same possible hyperparameters as with 'helper_train' + # ... + }, + 'max_decoding_length_train': None, + 'max_decoding_length_infer': None, + 'name': 'attention_rnn_decoder' +} diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py new file mode 100644 index 00000000..e1b30a36 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train.py @@ -0,0 +1 @@ +tau = 1. diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py new file mode 100755 index 00000000..7ba581ec --- /dev/null +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attentional Seq2seq. 
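+
+This example trains the attentional seq2seq model with the differentiable
+expected BLEU (DEBLEU) loss ``tx.losses.differentiable_expected_bleu``: the
+decoder is unrolled with a ``GumbelSoftmaxEmbeddingHelper`` so that soft
+output distributions are fed back as decoder inputs, and inference uses
+beam-search decoding.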
+""" +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +#pylint: disable=invalid-name, too-many-arguments, too-many-locals + +import importlib +import tensorflow as tf +import texar as tx + +flags = tf.flags + +flags.DEFINE_string("config_train", "config_train", "The training config.") +flags.DEFINE_string("config_model", "config_model", "The model config.") +flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") + +FLAGS = flags.FLAGS + +config_train = importlib.import_module(FLAGS.config_train) +config_model = importlib.import_module(FLAGS.config_model) +config_data = importlib.import_module(FLAGS.config_data) + + +def build_model(batch, train_data): + """Assembles the seq2seq model. + """ + source_embedder = tx.modules.WordEmbedder( + vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) + + encoder = tx.modules.BidirectionalRNNEncoder( + hparams=config_model.encoder) + + enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) + + target_embedder = tx.modules.WordEmbedder( + vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) + + decoder = tx.modules.AttentionRNNDecoder( + memory=tf.concat(enc_outputs, axis=2), + memory_sequence_length=batch['source_length'], + vocab_size=train_data.target_vocab.size, + hparams=config_model.decoder) + + start_tokens = tf.ones_like(batch['target_length']) * \ + train_data.target_vocab.bos_token_id + end_token = train_data.target_vocab.eos_token_id + + helper = tx.modules.GumbelSoftmaxEmbeddingHelper( + embedding=target_embedder, + start_tokens=start_tokens, + end_token=end_token, + tau=config_train.tau) + + training_outputs, _, _ = decoder( + helper=helper, + max_decoding_length=50) + + train_op = tx.core.get_train_op( + tx.losses.differentiable_expected_bleu( + #TODO: decide whether to include BOS + labels=batch['target_text_ids'][:, 1:], + logits=training_outputs.logits, + sequence_length=batch['target_length'] - 1)) + + beam_search_outputs, _, _ = \ + tx.modules.beam_search_decode( + decoder_or_cell=decoder, + embedding=target_embedder, + start_tokens=start_tokens, + end_token=end_token, + beam_width=config_model.beam_width, + max_decoding_length=50) + + return train_op, beam_search_outputs + + +def main(): + """Entrypoint. 
+ """ + train_data = tx.data.PairedTextData(hparams=config_data.train) + val_data = tx.data.PairedTextData(hparams=config_data.val) + test_data = tx.data.PairedTextData(hparams=config_data.test) + data_iterator = tx.data.TrainTestDataIterator( + train=train_data, val=val_data, test=test_data) + + batch = data_iterator.get_next() + + train_op, infer_outputs = build_model(batch, train_data) + + def _train_epoch(sess): + data_iterator.switch_to_train_data(sess) + + step = 0 + while True: + try: + loss = sess.run(train_op) + if step % config_data.display == 0: + print("step={}, loss={:.4f}".format(step, loss)) + step += 1 + except tf.errors.OutOfRangeError: + break + + def _eval_epoch(sess, mode): + if mode == 'val': + data_iterator.switch_to_val_data(sess) + else: + data_iterator.switch_to_test_data(sess) + + refs, hypos = [], [] + while True: + try: + fetches = [ + batch['target_text'][:, 1:], + infer_outputs.predicted_ids[:, :, 0] + ] + feed_dict = { + tx.global_mode(): tf.estimator.ModeKeys.EVAL + } + target_texts_ori, output_ids = \ + sess.run(fetches, feed_dict=feed_dict) + + target_texts = tx.utils.strip_special_tokens(target_texts_ori) + output_texts = tx.utils.map_ids_to_strs( + ids=output_ids, vocab=val_data.target_vocab) + + for hypo, ref in zip(output_texts, target_texts): + hypos.append(hypo) + refs.append([ref]) + except tf.errors.OutOfRangeError: + break + + return tx.evals.corpus_bleu_moses(list_of_references=refs, + hypotheses=hypos) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + sess.run(tf.tables_initializer()) + + best_val_bleu = -1. + for i in range(config_data.num_epochs): + _train_epoch(sess) + + val_bleu = _eval_epoch(sess, 'val') + best_val_bleu = max(best_val_bleu, val_bleu) + print('val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( + i, val_bleu, best_val_bleu)) + + test_bleu = _eval_epoch(sess, 'test') + print('test epoch={}, BLEU={:.4f}'.format(i, test_bleu)) + + print('=' * 50) + + +if __name__ == '__main__': + main() + diff --git a/examples/differentiable_expected_bleu/prepare_data.py b/examples/differentiable_expected_bleu/prepare_data.py new file mode 100644 index 00000000..a5cc357b --- /dev/null +++ b/examples/differentiable_expected_bleu/prepare_data.py @@ -0,0 +1,53 @@ +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Downloads data. +""" +import tensorflow as tf +import texar as tx + +# pylint: disable=invalid-name + +flags = tf.flags + +flags.DEFINE_string("data", "iwslt14", "Data to download [iwslt14|toy_copy]") + +FLAGS = flags.FLAGS + +def prepare_data(): + """Downloads data. 
+ """ + if FLAGS.data == 'iwslt14': + tx.data.maybe_download( + urls='https://drive.google.com/file/d/' + '1Vuv3bed10qUxrpldHdYoiWLzPKa4pNXd/view?usp=sharing', + path='./', + filenames='iwslt14.zip', + extract=True) + elif FLAGS.data == 'toy_copy': + tx.data.maybe_download( + urls='https://drive.google.com/file/d/' + '1fENE2rakm8vJ8d3voWBgW4hGlS6-KORW/view?usp=sharing', + path='./', + filenames='toy_copy.zip', + extract=True) + else: + raise ValueError('Unknown data: {}'.format(FLAGS.data)) + +def main(): + """Entrypoint. + """ + prepare_data() + +if __name__ == '__main__': + main() diff --git a/texar/losses/__init__.py b/texar/losses/__init__.py index c684911c..48586d40 100644 --- a/texar/losses/__init__.py +++ b/texar/losses/__init__.py @@ -27,3 +27,4 @@ from texar.losses.adv_losses import * from texar.losses.rewards import * from texar.losses.entropy import * +from texar.losses.differentiable_expected_bleu import * diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/differentiable_expected_bleu.py new file mode 100644 index 00000000..5974e036 --- /dev/null +++ b/texar/losses/differentiable_expected_bleu.py @@ -0,0 +1,129 @@ +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Differentiable Expected BLEU loss +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +# pylint: disable=invalid-name, not-context-manager, protected-access, +# pylint: disable=too-many-arguments + +__all__ = [ + "differentiable_expected_bleu", +] + +def differentiable_expected_bleu(labels, + logits, + sequence_length, + time_major=False, + min_fn=lambda x: tf.minimum(1., x), + max_order=4, + weights=[.1, .3, .3, .3], + smooth_add=1e-9, + name=None): + """Computes sparse softmax cross entropy for each time step of sequence + predictions. + + Args: + labels: Target class indexes. I.e., classes are mutually exclusive + (each entry is in exactly one class). + + - If :attr:`time_major` is `False` (default), this must be\ + a Tensor of shape `[batch_size, max_time]`. + + - If `time_major` is `True`, this must be a Tensor of shape\ + `[max_time, batch_size].` + logits: Unscaled log probabilities. This must have the shape of + `[max_time, batch_size, num_classes]` or + `[batch_size, max_time, num_classes]` according to + the value of `time_major`. + sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond + the respective sequence lengths will have zero losses. + time_major (bool): The shape format of the inputs. If `True`, + :attr:`labels` and :attr:`logits` must have shape + `[max_time, batch_size, ...]`. If `False` + (default), they must have shape `[batch_size, max_time, ...]`. + name (str, optional): A name for the operation. + + Returns: + A Tensor containing the loss of rank 0. + + Example: + + .. 
code-block:: python + + embedder = WordEmbedder(vocab_size=data.vocab.size) + decoder = BasicRNNDecoder(vocab_size=data.vocab.size) + outputs, _, _ = decoder( + decoding_strategy='train_greedy', + inputs=embedder(data_batch['text_ids']), + sequence_length=data_batch['length']-1) + + loss = sequence_sparse_softmax_cross_entropy( + labels=data_batch['text_ids'][:, 1:], + logits=outputs.logits, + sequence_length=data_batch['length']-1) + + """ # TODO: rewrite example + with tf.name_scope(name, "sequence_sparse_softmax_cross_entropy"): + X = logits + Y = labels + + if time_major: + X = tf.transpose(X, [1, 0, 2]) + Y = tf.transpose(Y, [1, 0]) + + sizeX = tf.shape(X)[1] + sizeY = tf.shape(Y)[1] + + XY = tf.batch_gather(X, tf.tile(tf.expand_dims(tf.to_int32(Y), 1), [1, sizeX, 1])) + YY = tf.to_float(tf.equal(tf.expand_dims(Y, 2), tf.expand_dims(Y, 1))) + + maskX = tf.sequence_mask( + sequence_length + 1, maxlen=sizeX + 1, dtype=tf.float32) + maskY = tf.sequence_mask( + sequence_length + 1, maxlen=sizeY + 1, dtype=tf.float32) + matchXY = tf.expand_dims(maskX, 2) * tf.expand_dims(maskY, 1) + matchYY = tf.minimum(tf.expand_dims(maskY, 2), + tf.expand_dims(maskY, 1)) + + tot = [] + o = [] + + for order in range(max_order): + matchXY = XY[:, : sizeX - order, : sizeY - order] * matchXY[:, 1:, 1:] + matchYY = YY[:, : sizeY - order, : sizeY - order] * matchYY[:, 1:, 1:] + cntYX = tf.reduce_sum(matchXY, 1, keepdims=True) + cntYY = tf.reduce_sum(matchYY, 1, keepdims=True) + o_order = tf.reduce_sum(tf.reduce_sum( + min_fn(cntYY / (cntYX - matchXY + 1)) + * matchXY / tf.maximum(1., cntYY), + 2), 1) + # in order to avoid dividing 0 + tot_order = tf.maximum(1, sequence_length - order) + tot.append(tot_order) + o.append(o_order) + + tot = tf.stack(tot, 1) + o = tf.stack(o, 1) + prec = tf.reduce_sum(o, 0) / tf.to_float(tf.reduce_sum(tot, 0)) + neglog_prec = -tf.log(prec + smooth_add) + loss = tf.reduce_sum(weights * neglog_prec, 0) + + return loss From b83a145a688d0614b2e3468814986186d6ee9962 Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 01:10:27 -0400 Subject: [PATCH 02/65] modify DEBLEU loss interface from logits to probs --- texar/losses/differentiable_expected_bleu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/differentiable_expected_bleu.py index 5974e036..521fc903 100644 --- a/texar/losses/differentiable_expected_bleu.py +++ b/texar/losses/differentiable_expected_bleu.py @@ -29,7 +29,7 @@ ] def differentiable_expected_bleu(labels, - logits, + probs, sequence_length, time_major=False, min_fn=lambda x: tf.minimum(1., x), @@ -82,7 +82,7 @@ def differentiable_expected_bleu(labels, """ # TODO: rewrite example with tf.name_scope(name, "sequence_sparse_softmax_cross_entropy"): - X = logits + X = probs Y = labels if time_major: From 87bd449fa65290c96d9bb4f96adfffc3e31c277a Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 01:12:32 -0400 Subject: [PATCH 03/65] add TeacherMaskSoftmaxEmbeddingHelper --- .../differentiable_expected_bleu.py | 12 +- texar/modules/decoders/rnn_decoder_helpers.py | 104 ++++++++++++++++-- 2 files changed, 100 insertions(+), 16 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7ba581ec..ea36d62c 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -61,10 
+61,12 @@ def build_model(batch, train_data): train_data.target_vocab.bos_token_id end_token = train_data.target_vocab.eos_token_id - helper = tx.modules.GumbelSoftmaxEmbeddingHelper( + helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=batch['target_text_ids'], + sequence_length=batch['target_length']-1, embedding=target_embedder, - start_tokens=start_tokens, - end_token=end_token, + n_unmask=1, + n_mask=0, tau=config_train.tau) training_outputs, _, _ = decoder( @@ -75,8 +77,8 @@ def build_model(batch, train_data): tx.losses.differentiable_expected_bleu( #TODO: decide whether to include BOS labels=batch['target_text_ids'][:, 1:], - logits=training_outputs.logits, - sequence_length=batch['target_length'] - 1)) + probs=training_outputs.sample_id, + sequence_length=batch['target_length']-1)) beam_search_outputs, _, _ = \ tx.modules.beam_search_decode( diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 559f3c29..24ec60a4 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -38,6 +38,7 @@ "_get_training_helper", "GumbelSoftmaxEmbeddingHelper", "SoftmaxEmbeddingHelper", + "TeacherMaskSoftmaxEmbeddingHelper", ] def default_helper_train_hparams(): @@ -185,6 +186,17 @@ def _get_training_helper( #pylint: disable=invalid-name return helper +def get_embedding_and_fn(embedding): + if isinstance(embedding, EmbedderBase): + embedding = embedding.embedding + + if callable(embedding): + raise ValueError("`embedding` must be an embedding tensor or an " + "instance of subclass of `EmbedderBase`.") + else: + return embedding, (lambda ids: tf.nn.embedding_lookup(embedding, ids)) + + class SoftmaxEmbeddingHelper(TFHelper): """A helper that feeds softmax probabilities over vocabulary to the next step. 
@@ -215,17 +227,7 @@ class SoftmaxEmbeddingHelper(TFHelper): def __init__(self, embedding, start_tokens, end_token, tau, stop_gradient=False, use_finish=True): - if isinstance(embedding, EmbedderBase): - embedding = embedding.embedding - - if callable(embedding): - raise ValueError("`embedding` must be an embedding tensor or an " - "instance of subclass of `EmbedderBase`.") - else: - self._embedding = embedding - self._embedding_fn = ( - lambda ids: tf.nn.embedding_lookup(embedding, ids)) - + self._embedding, self._embedding_fn = get_embedding_and_fn(embedding) self._start_tokens = tf.convert_to_tensor( start_tokens, dtype=tf.int32, name="start_tokens") self._end_token = tf.convert_to_tensor( @@ -326,3 +328,83 @@ def sample(self, time, outputs, state, name=None): sample_ids = tf.stop_gradient(sample_ids_hard - sample_ids) \ + sample_ids return sample_ids + + +class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): + def __init__(self, inputs, sequence_length, embedding, n_unmask, + n_mask, tau=1., time_major=False, seed=None, + stop_gradient=False): + super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( + inputs=inputs, + sequence_length=sequence_length, + time_major=time_major) + + self._embedding, self._embedding_fn = get_embedding_and_fn(embedding) + self._tau = tau + self._seed = seed + self._stop_gradient = stop_gradient + + self._zero_next_inputs = tf.zeros_like( + self._embedding_fn(self._zero_inputs)) + + self._n_unmask = tf.Variable(n_unmask, name='n_unmask') + self._n_mask = tf.Variable(n_mask, name='n_mask') + self._n_cycle = tf.add(self._n_unmask, self._n_mask, name='n_cycle') + self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) + self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) + self._assign_n_unmask = tf.assign(self._n_unmask, self._new_n_unmask) + self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) + self._n_shift = tf.random_uniform( + [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, + seed=self._seed, name='n_shift') + + @property + def sample_ids_dtype(self): + return tf.float32 + + @property + def sample_ids_shape(self): + return self._embedding.get_shape()[:1] + + def assign_mask_pattern(self, n_unmask, n_mask, sess): + sess.run([self._assign_n_unmask, self._assign_n_mask], + feed_dict={self._new_n_unmask: n_unmask, + self._new_n_mask: n_mask}) + + def _is_masked(self, time): + return time % self._n_cycle < self._n_mask + + def initialize(self, name=None): + finished = tf.equal(0, self._sequence_length) + all_finished = tf.reduce_all(finished) + next_inputs = tf.cond( + all_finished, + lambda: self._zero_next_inputs, + lambda: self._embedding_fn(self._input_tas.read(0))) + return (finished, next_inputs) + + def sample(self, time, outputs, state, name=None): + """Returns `sample_id` of shape `[batch_size, vocab_size]`. 
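+
+        If ``self._is_masked(next_time)`` is true, i.e.
+        ``next_time % (n_unmask + n_mask) < n_mask``, the ground-truth token
+        read from ``inputs`` is returned as a one-hot distribution
+        (a teacher-forced position); otherwise the softmax of the decoder
+        output divided by ``tau`` is returned. For example, with
+        ``n_unmask=2`` and ``n_mask=2`` the pattern repeats with period 4,
+        so two consecutive positions out of every four are teacher-forced.
+        ``next_inputs`` then feeds ``sample_ids`` multiplied by the
+        embedding matrix to the next step.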
+ """ + next_time = time + 1 + sample_ids = tf.cond( + self._is_masked(next_time), + lambda: tf.one_hot(self._input_tas.read(next_time), + self._embedding.get_shape()[0]), + lambda: tf.nn.softmax(outputs / self._tau)) + return sample_ids + + def next_inputs(self, time, outputs, state, sample_ids, name=None): + next_time = time + 1 + finished = (next_time >= self._sequence_length) + all_finished = tf.reduce_all(finished) + if self._stop_gradient: + sample_ids = tf.stop_gradient(sample_ids) + next_inputs = tf.cond( + all_finished, + lambda: self._zero_next_inputs, + lambda: tf.cond( # for efficiency + self._is_masked(next_time), + lambda: self._embedding_fn(self._input_tas.read(next_time)), + lambda: tf.matmul(sample_ids, self._embedding))) + return (finished, next_inputs, state) From a730a254f36e9313b563d131b6c95bfc3b0418ca Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 16:31:33 -0400 Subject: [PATCH 04/65] change API of sess --- texar/modules/decoders/rnn_decoder_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 24ec60a4..1d442c9a 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -366,7 +366,7 @@ def sample_ids_dtype(self): def sample_ids_shape(self): return self._embedding.get_shape()[:1] - def assign_mask_pattern(self, n_unmask, n_mask, sess): + def assign_mask_pattern(self, sess, n_unmask, n_mask): sess.run([self._assign_n_unmask, self._assign_n_mask], feed_dict={self._new_n_unmask: n_unmask, self._new_n_mask: n_mask}) From 86c2f9efd17e2d037684ef80f3c83322abdb2f7b Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 16:32:21 -0400 Subject: [PATCH 05/65] add xe ; refine configs --- .../config_iwslt14.py | 21 ++- .../config_model.py | 6 +- .../config_train.py | 3 + .../differentiable_expected_bleu.py | 146 +++++++++--------- 4 files changed, 91 insertions(+), 85 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py index 0c36dc73..3fbff240 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -1,12 +1,8 @@ - -num_epochs = 15 -display = 500 - -source_vocab_file = './data/iwslt14/vocab.de' -target_vocab_file = './data/iwslt14/vocab.en' +source_vocab_file = 'data/iwslt14/vocab.de' +target_vocab_file = 'data/iwslt14/vocab.en' train = { - 'batch_size': 32, + 'batch_size': 80, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14/train.de', @@ -17,10 +13,11 @@ 'files': 'data/iwslt14/train.en', 'vocab_file': target_vocab_file, 'max_seq_length': 50 - } + }, + 'allow_smaller_final_batch': False, } val = { - 'batch_size': 32, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14/valid.de', @@ -29,10 +26,10 @@ 'target_dataset': { 'files': 'data/iwslt14/valid.en', 'vocab_file': target_vocab_file, - } + }, } test = { - 'batch_size': 32, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14/test.de', @@ -41,5 +38,5 @@ 'target_dataset': { 'files': 'data/iwslt14/test.en', 'vocab_file': target_vocab_file, - } + }, } diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 8ef3c9b3..3ba0c867 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ 
b/examples/differentiable_expected_bleu/config_model.py @@ -1,11 +1,11 @@ # Attentional Seq2seq model. # Hyperparameters not specified here will take the default values. -num_units = 256 -beam_width = 10 +num_units = 1000 +embedding_dim = 500 embedder = { - 'dim': num_units + 'dim': embedding_dim } encoder = { 'rnn_cell_fw': { diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index e1b30a36..f0669550 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1 +1,4 @@ +max_epochs = 1000 tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ea36d62c..37fcb860 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -37,6 +37,13 @@ config_data = importlib.import_module(FLAGS.config_data) +def get_data_loader(sess, fetches, feed_dict): + while True: + try: + yield sess.run(fetches, feed_dict=feed_dict) + except tf.errors.OutOfRangeError: + break + def build_model(batch, train_data): """Assembles the seq2seq model. """ @@ -57,39 +64,51 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - start_tokens = tf.ones_like(batch['target_length']) * \ - train_data.target_vocab.bos_token_id - end_token = train_data.target_vocab.eos_token_id + # cross-entropy + teacher-forcing pretraining + tf_outputs, _, _ = decoder( + decoding_strategy='train_greedy', + inputs=target_embedder(batch['target_text_ids'][:, :-1]), + sequence_length=batch['target_length']-1) - helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - inputs=batch['target_text_ids'], + train_xe_op = tx.core.get_train_op( + tx.losses.sequence_sparse_softmax_cross_entropy( + labels=batch['target_text_ids'][:, 1:], + logits=tf_outputs.logits, + sequence_length=batch['target_length']-1)) + + # teacher mask + DEBLEU fine-tuning + tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=batch['target_text_ids'][:, :-1], sequence_length=batch['target_length']-1, embedding=target_embedder, n_unmask=1, n_mask=0, tau=config_train.tau) - training_outputs, _, _ = decoder( - helper=helper, - max_decoding_length=50) + tm_outputs, _, _ = decoder( + helper=tm_helper) - train_op = tx.core.get_train_op( + train_debleu_op = tx.core.get_train_op( tx.losses.differentiable_expected_bleu( #TODO: decide whether to include BOS labels=batch['target_text_ids'][:, 1:], - probs=training_outputs.sample_id, + probs=tm_outputs.sample_id, sequence_length=batch['target_length']-1)) - beam_search_outputs, _, _ = \ - tx.modules.beam_search_decode( - decoder_or_cell=decoder, - embedding=target_embedder, - start_tokens=start_tokens, - end_token=end_token, - beam_width=config_model.beam_width, - max_decoding_length=50) + # inference: beam search decoding + start_tokens = tf.ones_like(batch['target_length']) * \ + train_data.target_vocab.bos_token_id + end_token = train_data.target_vocab.eos_token_id + + bs_outputs, _, _ = tx.modules.beam_search_decode( + decoder_or_cell=decoder, + embedding=target_embedder, + start_tokens=start_tokens, + end_token=end_token, + beam_width=config_train.infer_beam_width, + max_decoding_length=config_train.infer_max_decoding_length) - return train_op, beam_search_outputs 
+ return train_xe_op, train_debleu_op, bs_outputs def main(): @@ -98,55 +117,47 @@ def main(): train_data = tx.data.PairedTextData(hparams=config_data.train) val_data = tx.data.PairedTextData(hparams=config_data.val) test_data = tx.data.PairedTextData(hparams=config_data.test) - data_iterator = tx.data.TrainTestDataIterator( - train=train_data, val=val_data, test=test_data) + data_iterator = tx.data.FeedableDataIterator( + {'train': train_data, 'val': val_data, 'test': test_data}) - batch = data_iterator.get_next() + data_batch = data_iterator.get_next() - train_op, infer_outputs = build_model(batch, train_data) + train_xe_op, train_debleu_op, infer_outputs = \ + build_model(data_batch, train_data) def _train_epoch(sess): - data_iterator.switch_to_train_data(sess) - - step = 0 - while True: - try: - loss = sess.run(train_op) - if step % config_data.display == 0: - print("step={}, loss={:.4f}".format(step, loss)) - step += 1 - except tf.errors.OutOfRangeError: - break + data_iterator.restart_dataset(sess, 'train') + feed_dict = { + tx.global_mode(): tf.estimator.ModeKeys.TRAIN, + data_iterator.handle: data_iterator.get_handle(sess, 'train') + } - def _eval_epoch(sess, mode): - if mode == 'val': - data_iterator.switch_to_val_data(sess) - else: - data_iterator.switch_to_test_data(sess) - - refs, hypos = [], [] - while True: - try: - fetches = [ - batch['target_text'][:, 1:], - infer_outputs.predicted_ids[:, :, 0] - ] - feed_dict = { - tx.global_mode(): tf.estimator.ModeKeys.EVAL - } - target_texts_ori, output_ids = \ - sess.run(fetches, feed_dict=feed_dict) - - target_texts = tx.utils.strip_special_tokens(target_texts_ori) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - - for hypo, ref in zip(output_texts, target_texts): - hypos.append(hypo) - refs.append([ref]) - except tf.errors.OutOfRangeError: - break + for batch_i, batch in \ + enumerate(get_data_loader(sess, data_batch, feed_dict)): + loss = sess.run(train_xe_op, feed_dict=feed_dict) + def _eval_epoch(sess, mode): + data_iterator.restart_dataset(sess, mode) + feed_dict = { + tx.global_mode(): tf.estimator.ModeKeys.EVAL, + data_iterator.handle: data_iterator.get_handle(sess, mode) + } + + ref_hypo_pairs = [] + fetches = [ + batch['target_text'][:, 1:], + infer_outputs.predicted_ids[:, :, 0] + ] + for target_texts_ori, output_ids in \ + get_data_loader(sess, fetches, feed_dict): + target_texts = tx.utils.strip_special_tokens(target_texts_ori) + output_texts = tx.utils.map_ids_to_strs( + ids=output_ids, vocab=val_data.target_vocab) + + ref_hypo_pairs.extend( + zip(map(lambda x: [x], target_texts), output_texts)) + + refs, hypos = zip(*ref_hypo_pairs) return tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) @@ -155,19 +166,14 @@ def _eval_epoch(sess, mode): sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - best_val_bleu = -1. 
- for i in range(config_data.num_epochs): + epoch = 0 + while epoch < config_train.max_epochs: _train_epoch(sess) + epoch += 1 val_bleu = _eval_epoch(sess, 'val') - best_val_bleu = max(best_val_bleu, val_bleu) - print('val epoch={}, BLEU={:.4f}; best-ever={:.4f}'.format( - i, val_bleu, best_val_bleu)) test_bleu = _eval_epoch(sess, 'test') - print('test epoch={}, BLEU={:.4f}'.format(i, test_bleu)) - - print('=' * 50) if __name__ == '__main__': From e10c78b75b73fab78840d5aa3a6b447f28ab3020 Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 21:41:55 -0400 Subject: [PATCH 06/65] fix a typo in doc --- texar/core/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texar/core/optimization.py b/texar/core/optimization.py index 1e24e9b3..af48c17d 100644 --- a/texar/core/optimization.py +++ b/texar/core/optimization.py @@ -125,7 +125,7 @@ def default_optimization_hparams(): :tf_main:`tf.clip_by_average_norm `, etc. "type" specifies the gradient clip function, and can be a function, - or its name or mudule path. If function name is provided, the + or its name or module path. If function name is provided, the function must be from module :tf_main:`tf < >` or :mod:`texar.custom`. "kwargs" specifies keyword arguments to the function, except arguments From bdbca3b5ff98d7d2eae51f04c32569923847059a Mon Sep 17 00:00:00 2001 From: wwt Date: Fri, 5 Oct 2018 22:47:54 -0400 Subject: [PATCH 07/65] add summary and checkpoints ; add train configs --- .../config_train.py | 25 ++++++++ .../differentiable_expected_bleu.py | 63 ++++++++++++++----- 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index f0669550..2cbe8dd7 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,4 +1,29 @@ max_epochs = 1000 +steps_per_eval = 500 tau = 1. infer_beam_width = 1 infer_max_decoding_length = 50 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ }, + }, +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 37fcb860..ee042be0 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -21,6 +21,7 @@ #pylint: disable=invalid-name, too-many-arguments, too-many-locals import importlib +import os import tensorflow as tf import texar as tx @@ -74,7 +75,8 @@ def build_model(batch, train_data): tx.losses.sequence_sparse_softmax_cross_entropy( labels=batch['target_text_ids'][:, 1:], logits=tf_outputs.logits, - sequence_length=batch['target_length']-1)) + sequence_length=batch['target_length']-1), + hparams=config_train.train_xe) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -93,7 +95,8 @@ def build_model(batch, train_data): #TODO: decide whether to include BOS labels=batch['target_text_ids'][:, 1:], probs=tm_outputs.sample_id, - sequence_length=batch['target_length']-1)) + sequence_length=batch['target_length']-1), + hparams=config_train.train_debleu) # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -122,21 +125,29 @@ def main(): data_batch = data_iterator.get_next() + global_step = tf.train.create_global_step() + train_xe_op, train_debleu_op, infer_outputs = \ build_model(data_batch, train_data) - def _train_epoch(sess): + merged_summary = tf.summary.merge_all() + + saver = tf.train.Saver(max_to_keep=None) + + def _train_epoch(sess, summary_writer): data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, data_iterator.handle: data_iterator.get_handle(sess, 'train') } - for batch_i, batch in \ - enumerate(get_data_loader(sess, data_batch, feed_dict)): - loss = sess.run(train_xe_op, feed_dict=feed_dict) + for loss, summary, step in get_data_loader( + sess, (train_xe_op, merged_summary, global_step), feed_dict): + summary_writer.add_summary(summary, step) + if step % config_train.steps_per_eval == 0: + _eval_epoch(sess, summary_writer, 'val') - def _eval_epoch(sess, mode): + def _eval_epoch(sess, summary_writer, mode): data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -145,7 +156,7 @@ def _eval_epoch(sess, mode): ref_hypo_pairs = [] fetches = [ - batch['target_text'][:, 1:], + data_batch['target_text'][:, 1:], infer_outputs.predicted_ids[:, :, 0] ] for target_texts_ori, output_ids in \ @@ -158,22 +169,42 @@ def _eval_epoch(sess, mode): zip(map(lambda x: [x], target_texts), output_texts)) refs, hypos = zip(*ref_hypo_pairs) - return tx.evals.corpus_bleu_moses(list_of_references=refs, + bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) + step = tf.train.global_step(sess, global_step) + summary = tf.Summary() + summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) + summary_writer.add_summary(summary, step) + return bleu + best_val_bleu = -1 with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) + ckpt_name = 'ckpt/model.ckpt' + if os.path.exists('ckpt') and tf.train.checkpoint_exists(ckpt_name): + print('restoring from {} ...'.format(ckpt_name)) + saver.restore(sess, ckpt_name) + else: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + + summary_writer = 
tf.summary.FileWriter('log', sess.graph) epoch = 0 while epoch < config_train.max_epochs: - _train_epoch(sess) + val_bleu = _eval_epoch(sess, summary_writer, 'val') + if val_bleu > best_val_bleu: + best_val_bleu = val_bleu + print('epoch: {}, step: {}, best val bleu: {}'.format( + epoch, + tf.train.global_step(sess, global_step), + best_val_bleu)) + saved_path = saver.save(sess, 'ckpt/best.ckpt') + print('saved to {}'.format(saved_path)) + _train_epoch(sess, summary_writer) epoch += 1 - - val_bleu = _eval_epoch(sess, 'val') - - test_bleu = _eval_epoch(sess, 'test') + saved_path = saver.save(sess, 'ckpt/model.ckpt') + print('saved to {}'.format(saved_path)) if __name__ == '__main__': From ffbf14d99207c2ff4c4dfbfe95816232c28cbf5b Mon Sep 17 00:00:00 2001 From: wwt Date: Sat, 6 Oct 2018 13:46:35 -0400 Subject: [PATCH 08/65] remove duplicated config --- examples/differentiable_expected_bleu/config_iwslt14.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py index 3fbff240..e6b40e97 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -14,7 +14,6 @@ 'vocab_file': target_vocab_file, 'max_seq_length': 50 }, - 'allow_smaller_final_batch': False, } val = { 'batch_size': 80, From 7e5b92bbfb42b2547d833126c48a353512c38176 Mon Sep 17 00:00:00 2001 From: wwt Date: Sat, 6 Oct 2018 17:34:49 -0400 Subject: [PATCH 09/65] copy tf.batch_gather --- texar/losses/differentiable_expected_bleu.py | 75 +++++++++++++++++++- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/differentiable_expected_bleu.py index 521fc903..1f3cc1f7 100644 --- a/texar/losses/differentiable_expected_bleu.py +++ b/texar/losses/differentiable_expected_bleu.py @@ -28,6 +28,73 @@ "differentiable_expected_bleu", ] +def batch_gather(params, indices, name=None): + """This function is copied and modified from tensorflow 11.0. + Gather slices from `params` according to `indices` with leading batch dims. + This operation assumes that the leading dimensions of `indices` are dense, + and the gathers on the axis corresponding to the last dimension of `indices`. + More concretely it computes: + result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]] + Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM], + `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be + a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`. + In the case in which indices is a 1D tensor, this operation is equivalent to + `tf.gather`. + See also `tf.gather` and `tf.gather_nd`. + Args: + params: A Tensor. The tensor from which to gather values. + indices: A Tensor. Must be one of the following types: int32, int64. Index + tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the + last dimension of `indices` itself. + name: A name for the operation (optional). + Returns: + A Tensor. Has the same type as `params`. + Raises: + ValueError: if `indices` has an unknown shape. 
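+
+    Example:
+      A small illustrative case (the numbers are made up for clarity and
+      are not part of the original TensorFlow docstring). In this module
+      the function is used to pick, for every decoding position, the
+      predicted probabilities of the reference tokens.
+
+      .. code-block:: python
+
+          params = tf.constant([[10, 11, 12],
+                                [20, 21, 22]])   # shape [2, 3]
+          indices = tf.constant([[2, 0],
+                                 [1, 1]])        # shape [2, 2]
+          batch_gather(params, indices)          # -> [[12, 10], [21, 21]]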
+ """ + + with tf.name_scope(name): + indices = tf.convert_to_tensor(indices, name="indices") + params = tf.convert_to_tensor(params, name="params") + indices_shape = tf.shape(indices) + params_shape = tf.shape(params) + + ndims = indices.shape.ndims + if ndims is None: + raise ValueError("batch_gather does not allow indices with unknown " + "shape.") + batch_indices = indices + indices_dtype = indices.dtype.base_dtype + accum_dim_value = tf.ones((), dtype=indices_dtype) + # Use correct type for offset index computation + casted_params_shape = tf.cast(params_shape, indices_dtype) + for dim in range(ndims-1, 0, -1): + dim_value = casted_params_shape[dim-1] + accum_dim_value *= casted_params_shape[dim] + start = tf.zeros((), dtype=indices_dtype) + step = tf.ones((), dtype=indices_dtype) + dim_indices = tf.range(start, dim_value, step) + dim_indices *= accum_dim_value + dim_shape = tf.stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim), + axis=0) + batch_indices += tf.reshape(dim_indices, dim_shape) + + flat_indices = tf.reshape(batch_indices, [-1]) + outer_shape = params_shape[ndims:] + flat_inner_shape = tf.reduce_prod(params_shape[:ndims]) + + flat_params = tf.reshape( + params, tf.concat([[flat_inner_shape], outer_shape], axis=0)) + flat_result = tf.gather(flat_params, flat_indices) + result = tf.reshape( + flat_result, tf.concat([indices_shape, outer_shape], axis=0)) + final_shape = indices.get_shape()[:ndims-1].merge_with( + params.get_shape()[:ndims -1]) + final_shape = final_shape.concatenate(indices.get_shape()[ndims-1]) + final_shape = final_shape.concatenate(params.get_shape()[ndims:]) + result.set_shape(final_shape) + return result + def differentiable_expected_bleu(labels, probs, sequence_length, @@ -92,7 +159,7 @@ def differentiable_expected_bleu(labels, sizeX = tf.shape(X)[1] sizeY = tf.shape(Y)[1] - XY = tf.batch_gather(X, tf.tile(tf.expand_dims(tf.to_int32(Y), 1), [1, sizeX, 1])) + XY = batch_gather(X, tf.tile(tf.expand_dims(Y, 1), [1, sizeX, 1])) YY = tf.to_float(tf.equal(tf.expand_dims(Y, 2), tf.expand_dims(Y, 1))) maskX = tf.sequence_mask( @@ -107,8 +174,10 @@ def differentiable_expected_bleu(labels, o = [] for order in range(max_order): - matchXY = XY[:, : sizeX - order, : sizeY - order] * matchXY[:, 1:, 1:] - matchYY = YY[:, : sizeY - order, : sizeY - order] * matchYY[:, 1:, 1:] + matchXY = XY[:, : sizeX - order, : sizeY - order] * \ + matchXY[:, 1:, 1:] + matchYY = YY[:, : sizeY - order, : sizeY - order] * \ + matchYY[:, 1:, 1:] cntYX = tf.reduce_sum(matchXY, 1, keepdims=True) cntYY = tf.reduce_sum(matchYY, 1, keepdims=True) o_order = tf.reduce_sum(tf.reduce_sum( From d8f7449c2830679139e1cf5d4d5df0f8dd89a327 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 6 Oct 2018 17:41:09 -0400 Subject: [PATCH 10/65] config dataset val=test --- examples/differentiable_expected_bleu/config_iwslt14.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_iwslt14.py index 3fbff240..cfcc6d71 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_iwslt14.py @@ -14,7 +14,6 @@ 'vocab_file': target_vocab_file, 'max_seq_length': 50 }, - 'allow_smaller_final_batch': False, } val = { 'batch_size': 80, @@ -40,3 +39,4 @@ 'vocab_file': target_vocab_file, }, } +val = test From 9bdbe09780f00e9eaecf9cc70cd2f5119dddd85d Mon Sep 17 00:00:00 2001 From: wwt Date: Sun, 7 Oct 2018 01:30:21 -0400 Subject: [PATCH 11/65] 
add triggers ; now the whole code is runnable --- .../config_train.py | 4 + .../differentiable_expected_bleu.py | 48 +++++-- .../differentiable_expected_bleu/triggers.py | 128 ++++++++++++++++++ texar/modules/decoders/rnn_decoder_helpers.py | 5 +- 4 files changed, 170 insertions(+), 15 deletions(-) create mode 100644 examples/differentiable_expected_bleu/triggers.py diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 2cbe8dd7..ac23d115 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -4,6 +4,10 @@ infer_beam_width = 1 infer_max_decoding_length = 50 +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +wait_steps = 10000 + train_xe = { "optimizer": { "type": "AdamOptimizer", diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ee042be0..e1db02a6 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -24,19 +24,23 @@ import os import tensorflow as tf import texar as tx +from triggers import BestEverConvergenceTrigger flags = tf.flags flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") +flags.DEFINE_boolean("pretraining", False, "whether pretraining") FLAGS = flags.FLAGS config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) +pretraining = FLAGS.pretraining +mask_patterns = config_train.mask_patterns def get_data_loader(sess, fetches, feed_dict): while True: @@ -80,11 +84,12 @@ def build_model(batch, train_data): # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - inputs=batch['target_text_ids'][:, :-1], + # must not remove last token, since it may be used as mask + inputs=batch['target_text_ids'], sequence_length=batch['target_length']-1, embedding=target_embedder, - n_unmask=1, - n_mask=0, + n_unmask=mask_patterns[0][0], + n_mask=mask_patterns[0][1], tau=config_train.tau) tm_outputs, _, _ = decoder( @@ -111,7 +116,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe_op, train_debleu_op, bs_outputs + return train_xe_op, train_debleu_op, tm_helper, bs_outputs def main(): @@ -127,14 +132,15 @@ def main(): global_step = tf.train.create_global_step() - train_xe_op, train_debleu_op, infer_outputs = \ + train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) + train_op = train_xe_op if pretraining else train_debleu_op merged_summary = tf.summary.merge_all() saver = tf.train.Saver(max_to_keep=None) - def _train_epoch(sess, summary_writer): + def _train_epoch(sess, summary_writer, train_op, trigger): data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, @@ -142,12 +148,12 @@ def _train_epoch(sess, summary_writer): } for loss, summary, step in get_data_loader( - sess, (train_xe_op, merged_summary, global_step), feed_dict): + sess, (train_op, merged_summary, global_step), feed_dict): 
summary_writer.add_summary(summary, step) if step % config_train.steps_per_eval == 0: - _eval_epoch(sess, summary_writer, 'val') + _eval_epoch(sess, summary_writer, 'val', trigger) - def _eval_epoch(sess, summary_writer, mode): + def _eval_epoch(sess, summary_writer, mode, trigger): data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -171,10 +177,17 @@ def _eval_epoch(sess, summary_writer, mode): refs, hypos = zip(*ref_hypo_pairs) bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) + step = tf.train.global_step(sess, global_step) summary = tf.Summary() summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) summary_writer.add_summary(summary, step) + + if trigger is not None: + triggered, _ = trigger(step, bleu) + if triggered: + print('triggered!') + return bleu best_val_bleu = -1 @@ -190,9 +203,22 @@ def _eval_epoch(sess, summary_writer, mode): summary_writer = tf.summary.FileWriter('log', sess.graph) + if pretraining: + trigger = None + else: + action = map( + lambda pattern: tm_helper.assign_mask_pattern( + sess, pattern[0], pattern[1]), + mask_patterns[1:]) + trigger = BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.wait_steps, + default=None) + epoch = 0 while epoch < config_train.max_epochs: - val_bleu = _eval_epoch(sess, summary_writer, 'val') + val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) if val_bleu > best_val_bleu: best_val_bleu = val_bleu print('epoch: {}, step: {}, best val bleu: {}'.format( @@ -201,7 +227,7 @@ def _eval_epoch(sess, summary_writer, mode): best_val_bleu)) saved_path = saver.save(sess, 'ckpt/best.ckpt') print('saved to {}'.format(saved_path)) - _train_epoch(sess, summary_writer) + _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 saved_path = saver.save(sess, 'ckpt/model.ckpt') print('saved to {}'.format(saved_path)) diff --git a/examples/differentiable_expected_bleu/triggers.py b/examples/differentiable_expected_bleu/triggers.py new file mode 100644 index 00000000..9b879048 --- /dev/null +++ b/examples/differentiable_expected_bleu/triggers.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +# Copyright 2018 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Attentional Seq2seq. +""" +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +#pylint: disable=invalid-name, too-many-arguments, too-many-locals + +try: + import queue +except ImportError: + import Queue as queue + +DEFAULT = object() + +class Trigger(object): + + def __init__(self, action, default=DEFAULT): + """action is an iterator that iteratively do a sequence of action and + return result values. default is used as result value when action is + exhausted. + """ + self._action = action + self._default = default + + def predicate(self, *args, **kwargs): + """This function returns True when we think we should do something. 
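+
+        Subclasses implement the actual convergence / schedule criterion;
+        :meth:`__call__` then advances the ``action`` iterator whenever the
+        predicate fires. A sketch of the intended usage, mirroring the
+        training script in this example (variable names are illustrative
+        only):
+
+        .. code-block:: python
+
+            action = (helper.assign_mask_pattern(sess, n_unmask, n_mask)
+                      for n_unmask, n_mask in mask_patterns[1:])
+            trigger = BestEverConvergenceTrigger(
+                action, threshold_steps, wait_steps, default=None)
+            # after each validation run:
+            triggered, _ = trigger(step, val_bleu)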
+ """ + raise NotImplementedError + + def __call__(self, *args, **kwargs): + pred = self.predicate(*args, **kwargs) + if pred: + ret = next(self._action) if self._default is DEFAULT else \ + next(self._action, self._default) + else: + ret = None + return pred, ret + + +class ScheduledStepsTrigger(Trigger): + + def __init__(self, action, steps, default=DEFAULT): + """steps should be in increasing order. + """ + super(ScheduledTrigger, self).__init__(action, default) + self._steps = iter(steps) + self._advance_steps() + + def _advance_steps(self): + self._next_step = next(step, None) + + def predicate(self, step): + while self._next_step is not None and step < self._next_step: + self._advance_steps() + if self._next_step is not None and step == self._next_step: + return True + return False + + +class BestEverConvergenceTrigger(Trigger): + + def __init__(self, action, threshold_steps, wait_steps, default=DEFAULT): + super(BestEverConvergenceTrigger, self).__init__(action, default) + self._threshold_steps = threshold_steps + self._wait_steps = wait_steps + self._last_triggered_step = None + self._best_ever_step = None + self._best_ever_score = None + + def predicate(self, step, score): + if self._best_ever_score is None or self._best_ever_score < score: + self._best_ever_score = score + self._best_ever_step = step + + if (self._last_triggered_step is None or + step - self._last_triggered_step >= self._wait_steps) and \ + step - self._best_ever_step >= self._threshold_steps: + self._last_triggered_step = step + return True + return False + + +class MovingAverageConvergenceTrigger(Trigger): + + def __init__(self, action, n, threshold, wait_steps, default=DEFAULT): + super(MovingAverageConvergenceTrigger, self).__init__(action, default) + self._n = n + self._threshold = threshold + self._wait_steps = wait_steps + self._last_triggered_step = None + self._head_queue = queue.Queue(self._n) + self._head_sum = 0 + self._rear_queue = queue.Queue(self._n) + self._rear_sum = 0 + + def predicate(self, step, score): + if self._head_queue.full(): + e = self._head_queue.get() + self._head_sum -= e + if self._rear_queue.full(): + self._rear_sum -= self._rear_queue.get() + self._rear_queue.put(e) + self._rear_sum += e + self._head_queue.put(score) + self._head_sum += score + + if (self._last_triggered_step is None or + step - self._last_triggered_step >= self._wait_steps) and \ + self._head_queue.full() and self._rear_queue.full() and \ + self._head_sum - self._rear_sum <= self._n * self._threshold: + self._last_triggered_step = step + return True + return False diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 1d442c9a..a9e2bd1c 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -403,8 +403,5 @@ def next_inputs(self, time, outputs, state, sample_ids, name=None): next_inputs = tf.cond( all_finished, lambda: self._zero_next_inputs, - lambda: tf.cond( # for efficiency - self._is_masked(next_time), - lambda: self._embedding_fn(self._input_tas.read(next_time)), - lambda: tf.matmul(sample_ids, self._embedding))) + lambda: tf.matmul(sample_ids, self._embedding)) return (finished, next_inputs, state) From d04e4e0d222c6d36f0fad660952d5a2a2a8d0979 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 7 Oct 2018 13:27:56 -0400 Subject: [PATCH 12/65] add learning rate --- examples/differentiable_expected_bleu/config_train.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 2cbe8dd7..563156db 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -7,6 +7,9 @@ train_xe = { "optimizer": { "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } }, "gradient_clip": { "type": "clip_by_global_norm", @@ -19,6 +22,9 @@ train_debleu = { "optimizer": { "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } }, "gradient_clip": { "type": "clip_by_global_norm", From 3f74126a05d5e4bcc77c64b6c7b38b4548c3e3b7 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 00:22:34 -0400 Subject: [PATCH 13/65] add mask summary ; fix action --- .../differentiable_expected_bleu.py | 120 +++++++++++------- texar/modules/decoders/rnn_decoder_helpers.py | 8 ++ 2 files changed, 84 insertions(+), 44 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index e1db02a6..8750f059 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -40,8 +40,24 @@ config_data = importlib.import_module(FLAGS.config_data) pretraining = FLAGS.pretraining +expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns +def optimistic_restore(session, save_file, graph=tf.get_default_graph()): + reader = tf.train.NewCheckpointReader(save_file) + saved_shapes = reader.get_variable_to_shape_map() + var_names = sorted([ + (var.name, var.name.split(':')[0]) for var in tf.global_variables() + if var.name.split(':')[0] in saved_shapes]) + restore_vars = [] + for var_name, saved_var_name in var_names: + curr_var = graph.get_tensor_by_name(var_name) + var_shape = curr_var.get_shape().as_list() + if var_shape == saved_shapes[saved_var_name]: + restore_vars.append(curr_var) + opt_saver = tf.train.Saver(restore_vars) + opt_saver.restore(session, save_file) + def get_data_loader(sess, fetches, feed_dict): while True: try: @@ -69,39 +85,48 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - # cross-entropy + teacher-forcing pretraining - tf_outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=target_embedder(batch['target_text_ids'][:, :-1]), - sequence_length=batch['target_length']-1) - - train_xe_op = tx.core.get_train_op( - tx.losses.sequence_sparse_softmax_cross_entropy( - labels=batch['target_text_ids'][:, 1:], - logits=tf_outputs.logits, - sequence_length=batch['target_length']-1), - hparams=config_train.train_xe) - - # teacher mask + DEBLEU fine-tuning - tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - # must not remove last token, since it may be used as mask - inputs=batch['target_text_ids'], - sequence_length=batch['target_length']-1, - embedding=target_embedder, - n_unmask=mask_patterns[0][0], - n_mask=mask_patterns[0][1], - tau=config_train.tau) - - tm_outputs, _, _ = decoder( - helper=tm_helper) - - train_debleu_op = tx.core.get_train_op( - tx.losses.differentiable_expected_bleu( - #TODO: decide whether to include BOS - labels=batch['target_text_ids'][:, 1:], - probs=tm_outputs.sample_id, - sequence_length=batch['target_length']-1), - hparams=config_train.train_debleu) + if pretraining: + # cross-entropy + teacher-forcing pretraining + tf_outputs, _, _ = decoder( + 
decoding_strategy='train_greedy', + inputs=target_embedder(batch['target_text_ids'][:, :-1]), + sequence_length=batch['target_length']-1) + + train_xe_op = tx.core.get_train_op( + tx.losses.sequence_sparse_softmax_cross_entropy( + labels=batch['target_text_ids'][:, 1:], + logits=tf_outputs.logits, + sequence_length=batch['target_length']-1), + hparams=config_train.train_xe) + else: + train_xe_op = None + + if not pretraining: + # teacher mask + DEBLEU fine-tuning + tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + # must not remove last token, since it may be used as mask + inputs=batch['target_text_ids'], + sequence_length=batch['target_length']-1, + embedding=target_embedder, + n_unmask=mask_patterns[0][0], + n_mask=mask_patterns[0][1], + tau=config_train.tau) + tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) + tf.summary.scalar('tm/n_mask', tm_helper.n_mask) + + tm_outputs, _, _ = decoder( + helper=tm_helper) + + train_debleu_op = tx.core.get_train_op( + tx.losses.differentiable_expected_bleu( + #TODO: decide whether to include BOS + labels=batch['target_text_ids'][:, 1:], + probs=tm_outputs.sample_id, + sequence_length=batch['target_length']-1), + hparams=config_train.train_debleu) + else: + tm_helper = None + train_debleu_op = None # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -141,6 +166,7 @@ def main(): saver = tf.train.Saver(max_to_keep=None) def _train_epoch(sess, summary_writer, train_op, trigger): + print('in _train_epoch') data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, @@ -153,7 +179,10 @@ def _train_epoch(sess, summary_writer, train_op, trigger): if step % config_train.steps_per_eval == 0: _eval_epoch(sess, summary_writer, 'val', trigger) + print('end _train_epoch') + def _eval_epoch(sess, summary_writer, mode, trigger): + print('in _eval_epoch with mode {}'.format(mode)) data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -182,34 +211,36 @@ def _eval_epoch(sess, summary_writer, mode, trigger): summary = tf.Summary() summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) summary_writer.add_summary(summary, step) + summary_writer.flush() if trigger is not None: triggered, _ = trigger(step, bleu) if triggered: print('triggered!') + print('end _eval_epoch') return bleu best_val_bleu = -1 with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - ckpt_name = 'ckpt/model.ckpt' - if os.path.exists('ckpt') and tf.train.checkpoint_exists(ckpt_name): + ckpt_path = os.path.join(expr_name, 'ckpt') + ckpt_name = os.path.join(ckpt_path, 'model.ckpt') + if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_name): print('restoring from {} ...'.format(ckpt_name)) - saver.restore(sess, ckpt_name) - else: - sess.run(tf.global_variables_initializer()) - sess.run(tf.local_variables_initializer()) + optimistic_restore(sess, ckpt_name) + print('done.') - summary_writer = tf.summary.FileWriter('log', sess.graph) + summary_writer = tf.summary.FileWriter( + os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) if pretraining: trigger = None else: - action = map( - lambda pattern: tm_helper.assign_mask_pattern( - sess, pattern[0], pattern[1]), - mask_patterns[1:]) + action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) + for n_unmask, n_mask in mask_patterns[1:]) trigger = BestEverConvergenceTrigger( 
action, config_train.threshold_steps, @@ -218,6 +249,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch = 0 while epoch < config_train.max_epochs: + print('epoch #{}:'.format(epoch)) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) if val_bleu > best_val_bleu: best_val_bleu = val_bleu diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index a9e2bd1c..bfe7cb99 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -366,6 +366,14 @@ def sample_ids_dtype(self): def sample_ids_shape(self): return self._embedding.get_shape()[:1] + @property + def n_unmask(self): + return self._n_unmask + + @property + def n_mask(self): + return self._n_mask + def assign_mask_pattern(self, sess, n_unmask, n_mask): sess.run([self._assign_n_unmask, self._assign_n_mask], feed_dict={self._new_n_unmask: n_unmask, From 69129f4bac5d553a657de8156bebba3048d3ce7c Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 14:25:25 -0400 Subject: [PATCH 14/65] fix random shift bug --- examples/differentiable_expected_bleu/config_train.py | 2 ++ texar/modules/decoders/rnn_decoder_helpers.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 890781c1..077f54f9 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -37,3 +37,5 @@ }, }, } + +expr_name = 'xe_1e3_debleu_1e5' diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index bfe7cb99..c12447d7 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -380,7 +380,7 @@ def assign_mask_pattern(self, sess, n_unmask, n_mask): self._new_n_mask: n_mask}) def _is_masked(self, time): - return time % self._n_cycle < self._n_mask + return (time + self._n_shift) % self._n_cycle < self._n_mask def initialize(self, name=None): finished = tf.equal(0, self._sequence_length) From 142665f6137bc2f7b7b6656aa274c371b7512071 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 15:33:33 -0400 Subject: [PATCH 15/65] don't restore Adam status --- .../differentiable_expected_bleu.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 8750f059..f7948cf9 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -55,6 +55,10 @@ def optimistic_restore(session, save_file, graph=tf.get_default_graph()): var_shape = curr_var.get_shape().as_list() if var_shape == saved_shapes[saved_var_name]: restore_vars.append(curr_var) + restore_vars = list(filter( + lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) + print('restoring variables:\n{}'.format('\n'.join( + var.name for var in restore_vars))) opt_saver = tf.train.Saver(restore_vars) opt_saver.restore(session, save_file) @@ -124,6 +128,7 @@ def build_model(batch, train_data): probs=tm_outputs.sample_id, sequence_length=batch['target_length']-1), hparams=config_train.train_debleu) + else: tm_helper = None train_debleu_op = None From c836e538afc71ddf3506384e24728d27faed8187 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: 
Mon, 8 Oct 2018 18:36:50 -0400 Subject: [PATCH 16/65] fix save path --- .../differentiable_expected_bleu.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index f7948cf9..7e3f6fde 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -168,7 +168,7 @@ def main(): merged_summary = tf.summary.merge_all() - saver = tf.train.Saver(max_to_keep=None) + saver = tf.train.Saver(max_to_keep=0) def _train_epoch(sess, summary_writer, train_op, trigger): print('in _train_epoch') @@ -232,10 +232,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) ckpt_path = os.path.join(expr_name, 'ckpt') - ckpt_name = os.path.join(ckpt_path, 'model.ckpt') - if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_name): - print('restoring from {} ...'.format(ckpt_name)) - optimistic_restore(sess, ckpt_name) + ckpt_model = os.path.join(ckpt_path, 'model.ckpt') + ckpt_best = os.path.join(ckpt_path, 'best.ckpt') + if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_model): + print('restoring from {} ...'.format(ckpt_model)) + optimistic_restore(sess, ckpt_model) print('done.') summary_writer = tf.summary.FileWriter( @@ -262,11 +263,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch, tf.train.global_step(sess, global_step), best_val_bleu)) - saved_path = saver.save(sess, 'ckpt/best.ckpt') + saved_path = saver.save(sess, os.path.join(ckpt_best)) print('saved to {}'.format(saved_path)) _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 - saved_path = saver.save(sess, 'ckpt/model.ckpt') + saved_path = saver.save(sess, ckpt_model) print('saved to {}'.format(saved_path)) From ffe568ddbe0ce1527275fe3bfb21ff9794ed116a Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 18:43:55 -0400 Subject: [PATCH 17/65] add flags.restore_adam --- .../differentiable_expected_bleu.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7e3f6fde..a31e5632 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -31,7 +31,8 @@ flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") -flags.DEFINE_boolean("pretraining", False, "whether pretraining") +flags.DEFINE_boolean("pretraining", False, "Whether pretraining.") +flags.DEFINE_boolean("restore_adam", False, "Whether to restore Adam states.") FLAGS = flags.FLAGS @@ -39,6 +40,7 @@ config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) pretraining = FLAGS.pretraining +restore_adam = FLAGS.restore_adam expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns @@ -55,8 +57,9 @@ def optimistic_restore(session, save_file, graph=tf.get_default_graph()): var_shape = curr_var.get_shape().as_list() if var_shape == saved_shapes[saved_var_name]: 
restore_vars.append(curr_var) - restore_vars = list(filter( - lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) + if not restore_adam: + restore_vars = list(filter( + lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) print('restoring variables:\n{}'.format('\n'.join( var.name for var in restore_vars))) opt_saver = tf.train.Saver(restore_vars) From 73d1c7b88b47967c4731627012a7f88b8efbec53 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 8 Oct 2018 19:49:46 -0400 Subject: [PATCH 18/65] add global_step onto saved ckpt --- .../differentiable_expected_bleu.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index a31e5632..4d6c12d6 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -171,7 +171,7 @@ def main(): merged_summary = tf.summary.merge_all() - saver = tf.train.Saver(max_to_keep=0) + saver = tf.train.Saver(max_to_keep=None) def _train_epoch(sess, summary_writer, train_op, trigger): print('in _train_epoch') @@ -262,15 +262,16 @@ def _eval_epoch(sess, summary_writer, mode, trigger): val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) if val_bleu > best_val_bleu: best_val_bleu = val_bleu + step = tf.train.global_step(sess, global_step) print('epoch: {}, step: {}, best val bleu: {}'.format( - epoch, - tf.train.global_step(sess, global_step), - best_val_bleu)) - saved_path = saver.save(sess, os.path.join(ckpt_best)) + epoch, step, best_val_bleu)) + saved_path = saver.save( + sess, os.path.join(ckpt_best), global_step=step) print('saved to {}'.format(saved_path)) _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 - saved_path = saver.save(sess, ckpt_model) + step = tf.train.global_step(sess, global_step) + saved_path = saver.save(sess, ckpt_model, global_step=step) print('saved to {}'.format(saved_path)) From bf92f2ee2d08fa052b05b08a9fae04d1661e8c12 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Wed, 10 Oct 2018 18:15:16 -0400 Subject: [PATCH 19/65] add flags.restore_mask --- ...g_train.py => config_en-fr_xe_1e3_xe_1e5_debleu.py} | 2 +- .../differentiable_expected_bleu.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) rename examples/differentiable_expected_bleu/{config_train.py => config_en-fr_xe_1e3_xe_1e5_debleu.py} (94%) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py similarity index 94% rename from examples/differentiable_expected_bleu/config_train.py rename to examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py index 077f54f9..07acbea8 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py @@ -38,4 +38,4 @@ }, } -expr_name = 'xe_1e3_debleu_1e5' +expr_name = 'en-fr_xe_1e3_xe_1e5_debleu' diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 4d6c12d6..7a3c1162 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -33,6 +33,7 @@ flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") 
flags.DEFINE_boolean("pretraining", False, "Whether pretraining.") flags.DEFINE_boolean("restore_adam", False, "Whether to restore Adam states.") +flags.DEFINE_boolean("restore_mask", False, "Whether to restore mask patterns.") FLAGS = flags.FLAGS @@ -41,6 +42,7 @@ config_data = importlib.import_module(FLAGS.config_data) pretraining = FLAGS.pretraining restore_adam = FLAGS.restore_adam +restore_mask = FLAGS.restore_mask expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns @@ -59,7 +61,13 @@ def optimistic_restore(session, save_file, graph=tf.get_default_graph()): restore_vars.append(curr_var) if not restore_adam: restore_vars = list(filter( - lambda var: var.name.split('/')[0] != 'OptimizeLoss', restore_vars)) + lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', + restore_vars)) + if not restore_mask: + restore_vars = list(filter( + lambda var: var.name.split(':')[0].split('/')[0] not in \ + ['n_unmask', 'n_mask'], + restore_vars)) print('restoring variables:\n{}'.format('\n'.join( var.name for var in restore_vars))) opt_saver = tf.train.Saver(restore_vars) From 9fe74cb2e80e88d23540d16099fc3e6c6d6bb139 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 14:37:33 -0400 Subject: [PATCH 20/65] remove config_model_full.py ; rename debleu ; rename some arguments ; build entire model together --- .../config_model_full.py | 127 --------------- .../differentiable_expected_bleu.py | 82 +++++----- texar/losses/__init__.py | 2 +- ...ferentiable_expected_bleu.py => debleu.py} | 153 +++++++++--------- 4 files changed, 114 insertions(+), 250 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_model_full.py rename texar/losses/{differentiable_expected_bleu.py => debleu.py} (55%) diff --git a/examples/differentiable_expected_bleu/config_model_full.py b/examples/differentiable_expected_bleu/config_model_full.py deleted file mode 100644 index b59ebc4e..00000000 --- a/examples/differentiable_expected_bleu/config_model_full.py +++ /dev/null @@ -1,127 +0,0 @@ -# The full possible hyperparameters for the attentional seq2seq model. -# Most of the hyperparameters take the default values and are not necessary to -# specify explicitly. The config here results in the same model with the -# `config_model.py`. - -num_units = 256 -beam_width = 10 - -# --------------------- Embedder --------------------- # -embedder = { - 'dim': num_units, - 'initializer': { - 'type': 'random_uniform_initializer', - 'kwargs': { - 'minval': -0.1, - 'maxval': 0.1, - 'seed': None - }, - }, - 'regularizer': { - 'type': 'L1L2', - 'kwargs': { - 'l1': 0, - 'l2': 0 - } - }, - 'dropout_rate': 0, - 'dropout_strategy': 'element', - 'trainable': True, - 'name': 'word_embedder' -} - -# --------------------- Encoder --------------------- # -encoder = { - 'rnn_cell_fw': { - 'type': 'LSTMCell', - 'kwargs': { - 'num_units': num_units, - 'forget_bias': 1.0, - 'activation': None, - # Other arguments go here for tf.nn.rnn_cell.LSTMCell - # ... - }, - 'num_layers': 1, - 'dropout': { - 'input_keep_prob': 1.0, - 'output_keep_prob': 1.0, - 'state_keep_prob': 1.0, - 'variational_recurrent': False, - 'input_size': [], - }, - 'residual': False, - 'highway': False, - }, - 'rnn_cell_bw': { - # The same possible hyperparameters as with 'rnn_cell_fw' - # ... 
- }, - 'rnn_cell_share_config': True, - 'output_layer_fw': { - 'num_layers': 0, - 'layer_size': 128, - 'activation': 'identity', - 'final_layer_activation': None, - 'other_dense_kwargs': None, - 'dropout_layer_ids': [], - 'dropout_rate': 0.5, - 'variational_dropout': False - }, - 'output_layer_bw': { - # The same possible hyperparameters as with 'output_layer_fw' - # ... - }, - 'output_layer_share_config': True, - 'name': 'bidirectional_rnn_encoder' -} - -# --------------------- Decoder --------------------- # -decoder = { - 'rnn_cell': { - 'type': 'LSTMCell', - 'kwargs': { - 'num_units': num_units, - 'forget_bias': 1.0, - 'activation': None, - # Other arguments go here for tf.nn.rnn_cell.LSTMCell - # ... - }, - 'num_layers': 1, - 'dropout': { - 'input_keep_prob': 1.0, - 'output_keep_prob': 1.0, - 'state_keep_prob': 1.0, - 'variational_recurrent': False, - 'input_size': [], - }, - 'residual': False, - 'highway': False, - }, - 'attention': { - 'type': 'LuongAttention', - 'kwargs': { - 'num_units': num_units, - 'scale': False, - 'probability_fn': None, - 'score_mask_value': None, - # Other arguments go here for tf.contrib.seq2seq.LuongAttention - # ... - }, - 'attention_layer_size': num_units, - 'alignment_history': False, - 'output_attention': True, - }, - 'helper_train': { - 'type': 'TrainingHelper', - 'kwargs': { - # Arguments go here for tf.contrib.seq2seq.TrainingHelper - } - }, - 'helper_infer': { - # The same possible hyperparameters as with 'helper_train' - # ... - }, - 'max_decoding_length_train': None, - 'max_decoding_length_infer': None, - 'name': 'attention_rnn_decoder' -} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7a3c1162..b8dcaa8d 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -100,49 +100,42 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - if pretraining: - # cross-entropy + teacher-forcing pretraining - tf_outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=target_embedder(batch['target_text_ids'][:, :-1]), - sequence_length=batch['target_length']-1) - - train_xe_op = tx.core.get_train_op( - tx.losses.sequence_sparse_softmax_cross_entropy( - labels=batch['target_text_ids'][:, 1:], - logits=tf_outputs.logits, - sequence_length=batch['target_length']-1), - hparams=config_train.train_xe) - else: - train_xe_op = None - - if not pretraining: - # teacher mask + DEBLEU fine-tuning - tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( - # must not remove last token, since it may be used as mask - inputs=batch['target_text_ids'], - sequence_length=batch['target_length']-1, - embedding=target_embedder, - n_unmask=mask_patterns[0][0], - n_mask=mask_patterns[0][1], - tau=config_train.tau) - tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) - tf.summary.scalar('tm/n_mask', tm_helper.n_mask) - - tm_outputs, _, _ = decoder( - helper=tm_helper) - - train_debleu_op = tx.core.get_train_op( - tx.losses.differentiable_expected_bleu( - #TODO: decide whether to include BOS - labels=batch['target_text_ids'][:, 1:], - probs=tm_outputs.sample_id, - sequence_length=batch['target_length']-1), - hparams=config_train.train_debleu) - - else: - tm_helper = None - train_debleu_op = None + # cross-entropy + teacher-forcing pretraining + tf_outputs, _, _ = decoder( + 
decoding_strategy='train_greedy', + inputs=target_embedder(batch['target_text_ids'][:, :-1]), + sequence_length=batch['target_length']-1) + + loss_xe = tx.losses.sequence_sparse_softmax_cross_entropy( + labels=batch['target_text_ids'][:, 1:], + logits=tf_outputs.logits, + sequence_length=batch['target_length']-1) + + train_xe_op = tx.core.get_train_op( + loss_xe, + hparams=config_train.train_xe) + + # teacher mask + DEBLEU fine-tuning + tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( + # must not remove last token, since it may be used as mask + inputs=batch['target_text_ids'], + sequence_length=batch['target_length']-1, + embedding=target_embedder, + n_unmask=mask_patterns[0][0], + n_mask=mask_patterns[0][1], + tau=config_train.tau) + + tm_outputs, _, _ = decoder( + helper=tm_helper) + + loss_debleu = tx.losses.debleu( + labels=batch['target_text_ids'][:, 1:], + probs=tm_outputs.sample_id, + sequence_length=batch['target_length']-1) + + train_debleu_op = tx.core.get_train_op( + loss_debleu, + hparams=config_train.train_debleu) # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -177,6 +170,9 @@ def main(): build_model(data_batch, train_data) train_op = train_xe_op if pretraining else train_debleu_op + tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) + tf.summary.scalar('tm/n_mask', tm_helper.n_mask) + merged_summary = tf.summary.merge_all() saver = tf.train.Saver(max_to_keep=None) diff --git a/texar/losses/__init__.py b/texar/losses/__init__.py index 48586d40..c8d09cfc 100644 --- a/texar/losses/__init__.py +++ b/texar/losses/__init__.py @@ -27,4 +27,4 @@ from texar.losses.adv_losses import * from texar.losses.rewards import * from texar.losses.entropy import * -from texar.losses.differentiable_expected_bleu import * +from texar.losses.debleu import * diff --git a/texar/losses/differentiable_expected_bleu.py b/texar/losses/debleu.py similarity index 55% rename from texar/losses/differentiable_expected_bleu.py rename to texar/losses/debleu.py index 1f3cc1f7..eeb9ba04 100644 --- a/texar/losses/differentiable_expected_bleu.py +++ b/texar/losses/debleu.py @@ -25,85 +25,80 @@ # pylint: disable=too-many-arguments __all__ = [ - "differentiable_expected_bleu", + "debleu", ] def batch_gather(params, indices, name=None): - """This function is copied and modified from tensorflow 11.0. - Gather slices from `params` according to `indices` with leading batch dims. - This operation assumes that the leading dimensions of `indices` are dense, - and the gathers on the axis corresponding to the last dimension of `indices`. - More concretely it computes: - result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]] - Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM], - `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be - a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`. - In the case in which indices is a 1D tensor, this operation is equivalent to - `tf.gather`. - See also `tf.gather` and `tf.gather_nd`. - Args: - params: A Tensor. The tensor from which to gather values. - indices: A Tensor. Must be one of the following types: int32, int64. Index - tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the - last dimension of `indices` itself. - name: A name for the operation (optional). - Returns: - A Tensor. Has the same type as `params`. - Raises: - ValueError: if `indices` has an unknown shape. 
- """ - - with tf.name_scope(name): - indices = tf.convert_to_tensor(indices, name="indices") - params = tf.convert_to_tensor(params, name="params") - indices_shape = tf.shape(indices) - params_shape = tf.shape(params) - - ndims = indices.shape.ndims - if ndims is None: - raise ValueError("batch_gather does not allow indices with unknown " - "shape.") - batch_indices = indices - indices_dtype = indices.dtype.base_dtype - accum_dim_value = tf.ones((), dtype=indices_dtype) - # Use correct type for offset index computation - casted_params_shape = tf.cast(params_shape, indices_dtype) - for dim in range(ndims-1, 0, -1): - dim_value = casted_params_shape[dim-1] - accum_dim_value *= casted_params_shape[dim] - start = tf.zeros((), dtype=indices_dtype) - step = tf.ones((), dtype=indices_dtype) - dim_indices = tf.range(start, dim_value, step) - dim_indices *= accum_dim_value - dim_shape = tf.stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim), - axis=0) - batch_indices += tf.reshape(dim_indices, dim_shape) - - flat_indices = tf.reshape(batch_indices, [-1]) - outer_shape = params_shape[ndims:] - flat_inner_shape = tf.reduce_prod(params_shape[:ndims]) - - flat_params = tf.reshape( - params, tf.concat([[flat_inner_shape], outer_shape], axis=0)) - flat_result = tf.gather(flat_params, flat_indices) - result = tf.reshape( - flat_result, tf.concat([indices_shape, outer_shape], axis=0)) - final_shape = indices.get_shape()[:ndims-1].merge_with( - params.get_shape()[:ndims -1]) - final_shape = final_shape.concatenate(indices.get_shape()[ndims-1]) - final_shape = final_shape.concatenate(params.get_shape()[ndims:]) - result.set_shape(final_shape) - return result - -def differentiable_expected_bleu(labels, - probs, - sequence_length, - time_major=False, - min_fn=lambda x: tf.minimum(1., x), - max_order=4, - weights=[.1, .3, .3, .3], - smooth_add=1e-9, - name=None): + """This function is copied and modified from tensorflow 11.0. See + https://www.tensorflow.org/api_docs/python/tf/batch_gather for details. + Gather slices from `params` according to `indices` with leading batch dims. + This operation assumes that the leading dimensions of `indices` are dense, + and the gathers on the axis corresponding to the last dimension of `indices`. + More concretely it computes: + result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]] + Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM], + `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be + a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`. + In the case in which indices is a 1D tensor, this operation is equivalent to + `tf.gather`. + See also `tf.gather` and `tf.gather_nd`. + Args: + params: A Tensor. The tensor from which to gather values. + indices: A Tensor. Must be one of the following types: int32, int64. Index + tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the + last dimension of `indices` itself. + name: A name for the operation (optional). + Returns: + A Tensor. Has the same type as `params`. + Raises: + ValueError: if `indices` has an unknown shape. 
+ """ + + with tf.name_scope(name): + indices = tf.convert_to_tensor(indices, name="indices") + params = tf.convert_to_tensor(params, name="params") + indices_shape = tf.shape(indices) + params_shape = tf.shape(params) + + ndims = indices.shape.ndims + if ndims is None: + raise ValueError("batch_gather does not allow indices with unknown " + "shape.") + batch_indices = indices + indices_dtype = indices.dtype.base_dtype + accum_dim_value = tf.ones((), dtype=indices_dtype) + # Use correct type for offset index computation + casted_params_shape = tf.cast(params_shape, indices_dtype) + for dim in range(ndims-1, 0, -1): + dim_value = casted_params_shape[dim-1] + accum_dim_value *= casted_params_shape[dim] + start = tf.zeros((), dtype=indices_dtype) + step = tf.ones((), dtype=indices_dtype) + dim_indices = tf.range(start, dim_value, step) + dim_indices *= accum_dim_value + dim_shape = tf.stack( + [1] * (dim - 1) + [dim_value] + [1] * (ndims - dim), axis=0) + batch_indices += tf.reshape(dim_indices, dim_shape) + + flat_indices = tf.reshape(batch_indices, [-1]) + outer_shape = params_shape[ndims:] + flat_inner_shape = tf.reduce_prod(params_shape[:ndims]) + + flat_params = tf.reshape( + params, tf.concat([[flat_inner_shape], outer_shape], axis=0)) + flat_result = tf.gather(flat_params, flat_indices) + result = tf.reshape( + flat_result, tf.concat([indices_shape, outer_shape], axis=0)) + final_shape = indices.get_shape()[:ndims-1].merge_with( + params.get_shape()[:ndims -1]) + final_shape = final_shape.concatenate(indices.get_shape()[ndims-1]) + final_shape = final_shape.concatenate(params.get_shape()[ndims:]) + result.set_shape(final_shape) + return result + +def debleu(labels, probs, sequence_length, time_major=False, + min_fn=lambda x: tf.minimum(1., x), max_order=4, + weights=[.1, .3, .3, .3], epsilon=1e-9, name=None): """Computes sparse softmax cross entropy for each time step of sequence predictions. 
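For reference, the renamed `tx.losses.debleu` keeps the `labels` / `probs` / `sequence_length` interface that the example script already uses. Below is a minimal usage sketch; the dummy tensors, shapes, and sizes are illustrative assumptions only and are not part of the patch.

```
# Minimal sketch of calling the DEBLEU loss (assumed shapes, dummy inputs).
import tensorflow as tf
import texar as tx

batch_size, max_time, vocab_size = 32, 50, 10000

# Soft per-step distributions over the vocabulary, e.g. the sample_id output
# of TeacherMaskSoftmaxEmbeddingHelper in the example.
probs = tf.nn.softmax(tf.random_normal([batch_size, max_time, vocab_size]))
# Gold token ids and their true lengths.
labels = tf.random_uniform(
    [batch_size, max_time], maxval=vocab_size, dtype=tf.int32)
lengths = tf.fill([batch_size], max_time)

loss = tx.losses.debleu(labels=labels, probs=probs, sequence_length=lengths)
train_op = tx.core.get_train_op(loss)  # same pattern as in the example script
```

In the example itself, `probs` comes from the teacher-masked decoder outputs and `labels` are the target ids with the BOS token stripped.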
@@ -148,7 +143,7 @@ def differentiable_expected_bleu(labels, sequence_length=data_batch['length']-1) """ # TODO: rewrite example - with tf.name_scope(name, "sequence_sparse_softmax_cross_entropy"): + with tf.name_scope(name, "debleu"): X = probs Y = labels @@ -184,7 +179,7 @@ def differentiable_expected_bleu(labels, min_fn(cntYY / (cntYX - matchXY + 1)) * matchXY / tf.maximum(1., cntYY), 2), 1) - # in order to avoid dividing 0 + # in order to avoid being divided by 0 tot_order = tf.maximum(1, sequence_length - order) tot.append(tot_order) o.append(o_order) @@ -192,7 +187,7 @@ def differentiable_expected_bleu(labels, tot = tf.stack(tot, 1) o = tf.stack(o, 1) prec = tf.reduce_sum(o, 0) / tf.to_float(tf.reduce_sum(tot, 0)) - neglog_prec = -tf.log(prec + smooth_add) + neglog_prec = -tf.log(prec + epsilon) loss = tf.reduce_sum(weights * neglog_prec, 0) return loss From 9dcde6a97da0266f1e31cf6be3035e68902b0051 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 16:21:00 -0400 Subject: [PATCH 21/65] fix checkpoint save and restore bug --- .../differentiable_expected_bleu.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index b8dcaa8d..a3b071fa 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -238,12 +238,14 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - ckpt_path = os.path.join(expr_name, 'ckpt') - ckpt_model = os.path.join(ckpt_path, 'model.ckpt') - ckpt_best = os.path.join(ckpt_path, 'best.ckpt') - if os.path.exists(ckpt_path) and tf.train.checkpoint_exists(ckpt_model): - print('restoring from {} ...'.format(ckpt_model)) - optimistic_restore(sess, ckpt_model) + dir_model = os.path.join(expr_name, 'ckpt') + dir_best = os.path.join(expr_name, 'ckpt-best') + ckpt_model = os.path.join(dir_model, 'model.ckpt') + ckpt_best = os.path.join(dir_best, 'model.ckpt') + if os.path.exists(dir_model): + ckpt_path = tf.train.latest_checkpoint(dir_model) + print('restoring from {} ...'.format(ckpt_path)) + optimistic_restore(sess, ckpt_path) print('done.') summary_writer = tf.summary.FileWriter( From 038478e8f0829115581eca44ae00db929a839ace Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 17:25:56 -0400 Subject: [PATCH 22/65] refine trigger --- .../config_train.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train.py diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py new file mode 100644 index 00000000..fe175a22 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train.py @@ -0,0 +1,45 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "piecewise_constant", + "kwargs": { + "boundaries": [160000], + "values": [1e-3, 1e-5], + }, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ }, + }, +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, +} + +expr_name = 'train' From 101d5a194bee1f43ef2844e71311f3f20e6ee14c Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 19:19:37 -0400 Subject: [PATCH 23/65] refine trigger --- .../differentiable_expected_bleu.py | 2 +- .../differentiable_expected_bleu/triggers.py | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index a3b071fa..d5db0664 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -259,7 +259,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): trigger = BestEverConvergenceTrigger( action, config_train.threshold_steps, - config_train.wait_steps, + config_train.minimum_interval_steps, default=None) epoch = 0 diff --git a/examples/differentiable_expected_bleu/triggers.py b/examples/differentiable_expected_bleu/triggers.py index 9b879048..65efe8ca 100644 --- a/examples/differentiable_expected_bleu/triggers.py +++ b/examples/differentiable_expected_bleu/triggers.py @@ -34,16 +34,16 @@ def __init__(self, action, default=DEFAULT): return result values. default is used as result value when action is exhausted. """ - self._action = action + self._action = iter(action) self._default = default - def predicate(self, *args, **kwargs): + def _predicate(self, *args, **kwargs): """This function returns True when we think we should do something. 
""" raise NotImplementedError def __call__(self, *args, **kwargs): - pred = self.predicate(*args, **kwargs) + pred = self._predicate(*args, **kwargs) if pred: ret = next(self._action) if self._default is DEFAULT else \ next(self._action, self._default) @@ -64,7 +64,7 @@ def __init__(self, action, steps, default=DEFAULT): def _advance_steps(self): self._next_step = next(step, None) - def predicate(self, step): + def _predicate(self, step): while self._next_step is not None and step < self._next_step: self._advance_steps() if self._next_step is not None and step == self._next_step: @@ -74,21 +74,23 @@ def predicate(self, step): class BestEverConvergenceTrigger(Trigger): - def __init__(self, action, threshold_steps, wait_steps, default=DEFAULT): + def __init__(self, action, threshold_steps, minimum_interval_steps, + default=DEFAULT): super(BestEverConvergenceTrigger, self).__init__(action, default) self._threshold_steps = threshold_steps - self._wait_steps = wait_steps + self._minimum_interval_steps = minimum_interval_steps self._last_triggered_step = None self._best_ever_step = None self._best_ever_score = None - def predicate(self, step, score): + def _predicate(self, step, score): if self._best_ever_score is None or self._best_ever_score < score: self._best_ever_score = score self._best_ever_step = step if (self._last_triggered_step is None or - step - self._last_triggered_step >= self._wait_steps) and \ + step - self._last_triggered_step >= + self._minimum_interval_steps) and \ step - self._best_ever_step >= self._threshold_steps: self._last_triggered_step = step return True @@ -97,18 +99,19 @@ def predicate(self, step, score): class MovingAverageConvergenceTrigger(Trigger): - def __init__(self, action, n, threshold, wait_steps, default=DEFAULT): + def __init__(self, action, n, threshold, minimum_interval_steps, + default=DEFAULT): super(MovingAverageConvergenceTrigger, self).__init__(action, default) self._n = n self._threshold = threshold - self._wait_steps = wait_steps + self._minimum_interval_steps = minimum_interval_steps self._last_triggered_step = None self._head_queue = queue.Queue(self._n) self._head_sum = 0 self._rear_queue = queue.Queue(self._n) self._rear_sum = 0 - def predicate(self, step, score): + def _predicate(self, step, score): if self._head_queue.full(): e = self._head_queue.get() self._head_sum -= e @@ -120,7 +123,8 @@ def predicate(self, step, score): self._head_sum += score if (self._last_triggered_step is None or - step - self._last_triggered_step >= self._wait_steps) and \ + step - self._last_triggered_step + >= self._minimum_interval_steps) and \ self._head_queue.full() and self._rear_queue.full() and \ self._head_sum - self._rear_sum <= self._n * self._threshold: self._last_triggered_step = step From b293bc17d5518d88f7f18aa1f0f3480861ecfcf1 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 21:23:09 -0400 Subject: [PATCH 24/65] add trigger save & restore (not tested yet) --- .../differentiable_expected_bleu.py | 47 +++++++++++++----- .../differentiable_expected_bleu/triggers.py | 48 +++++++++++++++++-- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index d5db0664..10b86d22 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -235,33 +235,44 @@ def _eval_epoch(sess, summary_writer, 
mode, trigger): best_val_bleu = -1 with tf.Session() as sess: + if pretraining: + trigger = None + else: + action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) + for n_unmask, n_mask in mask_patterns[1:]) + trigger = BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.minimum_interval_steps, + default=None) + sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) + dir_model = os.path.join(expr_name, 'ckpt') dir_best = os.path.join(expr_name, 'ckpt-best') ckpt_model = os.path.join(dir_model, 'model.ckpt') ckpt_best = os.path.join(dir_best, 'model.ckpt') + if os.path.exists(dir_model): ckpt_path = tf.train.latest_checkpoint(dir_model) print('restoring from {} ...'.format(ckpt_path)) optimistic_restore(sess, ckpt_path) + + if trigger is not None: + trigger_path = '{}.trigger'.format(ckpt_path) + if os.path.exists(trigger_path): + with open(trigger_path, 'r') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous trigger state.') + print('done.') summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) - if pretraining: - trigger = None - else: - action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) - for n_unmask, n_mask in mask_patterns[1:]) - trigger = BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) - epoch = 0 while epoch < config_train.max_epochs: print('epoch #{}:'.format(epoch)) @@ -272,12 +283,24 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('epoch: {}, step: {}, best val bleu: {}'.format( epoch, step, best_val_bleu)) saved_path = saver.save( - sess, os.path.join(ckpt_best), global_step=step) + sess, ckpt_best, global_step=step) + + if trigger is not None: + with open('{}.trigger'.format(ckpt_best), 'w') as \ + pickle_file: + trigger.save_to_pickle(pickle_file) + print('saved to {}'.format(saved_path)) + _train_epoch(sess, summary_writer, train_op, trigger) epoch += 1 step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) + + if trigger is not None: + with open('{}.trigger'.format(ckpt_model), 'w') as pickle_file: + trigger.save_to_pickle(pickle_file) + print('saved to {}'.format(saved_path)) diff --git a/examples/differentiable_expected_bleu/triggers.py b/examples/differentiable_expected_bleu/triggers.py index 65efe8ca..63aad56c 100644 --- a/examples/differentiable_expected_bleu/triggers.py +++ b/examples/differentiable_expected_bleu/triggers.py @@ -18,13 +18,15 @@ from __future__ import print_function from __future__ import division -#pylint: disable=invalid-name, too-many-arguments, too-many-locals +import pickle try: import queue except ImportError: import Queue as queue +#pylint: disable=invalid-name, too-many-arguments, too-many-locals + DEFAULT = object() class Trigger(object): @@ -36,21 +38,50 @@ def __init__(self, action, default=DEFAULT): """ self._action = iter(action) self._default = default + self._triggered_times = 0 def _predicate(self, *args, **kwargs): """This function returns True when we think we should do something. 
""" raise NotImplementedError + def _next_action(self): + return next(self._action) if self._default is DEFAULT else \ + next(self._action, self._default) + def __call__(self, *args, **kwargs): pred = self._predicate(*args, **kwargs) if pred: - ret = next(self._action) if self._default is DEFAULT else \ - next(self._action, self._default) + ret = self._next_action() + self._triggered_times += 1 else: ret = None return pred, ret + def _make_state(self, names): + return {name: getattr(self, name) for name in names} + + @property + def _state_names(self): + return ['_triggered_times'] + + @property + def state(self): + return self._make_state(self._state_names) + + def restore_from_state(self, state): + for name, value in state.items(): + setattr(self, name, value) + + for t in range(self._triggered_times): + self._next_action() + + def save_to_pickle(self, file): + pickle.dump(self.state, file) + + def restore_from_pickle(self, file): + self.restore_from_state(pickle.load(file)) + class ScheduledStepsTrigger(Trigger): @@ -96,6 +127,11 @@ def _predicate(self, step, score): return True return False + @property + def _state_names(self): + return super(BestEverConvergenceTrigger, self)._state_names + [ + '_last_triggered_step', '_best_ever_step', '_best_ever_score'] + class MovingAverageConvergenceTrigger(Trigger): @@ -130,3 +166,9 @@ def _predicate(self, step, score): self._last_triggered_step = step return True return False + + @property + def _state_names(self): + return super(BestEverConvergenceTrigger, self)._state_names + [ + '_last_triggered_step', '_head_queue', '_head_sum', '_rear_queue', + '_rear_sum'] From 9b2b38212cbe5a2ec521dbcfcddefe07d64a0177 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 21:34:37 -0400 Subject: [PATCH 25/65] move module triggers into texar/utils --- .../differentiable_expected_bleu.py | 3 +-- texar/utils/__init__.py | 1 + .../utils}/triggers.py | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) rename {examples/differentiable_expected_bleu => texar/utils}/triggers.py (94%) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 10b86d22..b578d2e7 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -24,7 +24,6 @@ import os import tensorflow as tf import texar as tx -from triggers import BestEverConvergenceTrigger flags = tf.flags @@ -240,7 +239,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): else: action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) for n_unmask, n_mask in mask_patterns[1:]) - trigger = BestEverConvergenceTrigger( + trigger = tx.utils.BestEverConvergenceTrigger( action, config_train.threshold_steps, config_train.minimum_interval_steps, diff --git a/texar/utils/__init__.py b/texar/utils/__init__.py index d22e2050..ab284e9c 100644 --- a/texar/utils/__init__.py +++ b/texar/utils/__init__.py @@ -29,3 +29,4 @@ from texar.utils.mode import * from texar.utils.average_recorder import * from texar.utils.utils_io import * +from texar.utils.triggers import * diff --git a/examples/differentiable_expected_bleu/triggers.py b/texar/utils/triggers.py similarity index 94% rename from examples/differentiable_expected_bleu/triggers.py rename to texar/utils/triggers.py index 63aad56c..efa339e0 100644 --- a/examples/differentiable_expected_bleu/triggers.py +++ b/texar/utils/triggers.py @@ -27,11 
+27,11 @@ #pylint: disable=invalid-name, too-many-arguments, too-many-locals -DEFAULT = object() +DEFAULT_ACTION = object() class Trigger(object): - def __init__(self, action, default=DEFAULT): + def __init__(self, action, default=DEFAULT_ACTION): """action is an iterator that iteratively do a sequence of action and return result values. default is used as result value when action is exhausted. @@ -46,7 +46,7 @@ def _predicate(self, *args, **kwargs): raise NotImplementedError def _next_action(self): - return next(self._action) if self._default is DEFAULT else \ + return next(self._action) if self._default is DEFAULT_ACTION else \ next(self._action, self._default) def __call__(self, *args, **kwargs): @@ -85,7 +85,7 @@ def restore_from_pickle(self, file): class ScheduledStepsTrigger(Trigger): - def __init__(self, action, steps, default=DEFAULT): + def __init__(self, action, steps, default=DEFAULT_ACTION): """steps should be in increasing order. """ super(ScheduledTrigger, self).__init__(action, default) @@ -106,7 +106,7 @@ def _predicate(self, step): class BestEverConvergenceTrigger(Trigger): def __init__(self, action, threshold_steps, minimum_interval_steps, - default=DEFAULT): + default=DEFAULT_ACTION): super(BestEverConvergenceTrigger, self).__init__(action, default) self._threshold_steps = threshold_steps self._minimum_interval_steps = minimum_interval_steps @@ -136,7 +136,7 @@ def _state_names(self): class MovingAverageConvergenceTrigger(Trigger): def __init__(self, action, n, threshold, minimum_interval_steps, - default=DEFAULT): + default=DEFAULT_ACTION): super(MovingAverageConvergenceTrigger, self).__init__(action, default) self._n = n self._threshold = threshold From 190d5b3beb4af581e56d3dc7c60b47ad1403b345 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 22:54:19 -0400 Subject: [PATCH 26/65] refine codes --- .../differentiable_expected_bleu.py | 140 ++++++++---------- texar/utils/triggers.py | 1 + 2 files changed, 65 insertions(+), 76 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index b578d2e7..782846a7 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -30,54 +30,18 @@ flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") -flags.DEFINE_boolean("pretraining", False, "Whether pretraining.") -flags.DEFINE_boolean("restore_adam", False, "Whether to restore Adam states.") -flags.DEFINE_boolean("restore_mask", False, "Whether to restore mask patterns.") +flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") FLAGS = flags.FLAGS config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) -pretraining = FLAGS.pretraining -restore_adam = FLAGS.restore_adam -restore_mask = FLAGS.restore_mask +pretrain_epochs = FLAGS.pretrain_epochs expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns -def optimistic_restore(session, save_file, graph=tf.get_default_graph()): - reader = tf.train.NewCheckpointReader(save_file) - saved_shapes = reader.get_variable_to_shape_map() - var_names = sorted([ - (var.name, var.name.split(':')[0]) 
for var in tf.global_variables() - if var.name.split(':')[0] in saved_shapes]) - restore_vars = [] - for var_name, saved_var_name in var_names: - curr_var = graph.get_tensor_by_name(var_name) - var_shape = curr_var.get_shape().as_list() - if var_shape == saved_shapes[saved_var_name]: - restore_vars.append(curr_var) - if not restore_adam: - restore_vars = list(filter( - lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', - restore_vars)) - if not restore_mask: - restore_vars = list(filter( - lambda var: var.name.split(':')[0].split('/')[0] not in \ - ['n_unmask', 'n_mask'], - restore_vars)) - print('restoring variables:\n{}'.format('\n'.join( - var.name for var in restore_vars))) - opt_saver = tf.train.Saver(restore_vars) - opt_saver.restore(session, save_file) - -def get_data_loader(sess, fetches, feed_dict): - while True: - try: - yield sess.run(fetches, feed_dict=feed_dict) - except tf.errors.OutOfRangeError: - break def build_model(batch, train_data): """Assembles the seq2seq model. @@ -110,6 +74,10 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) + #TODO: find a way to reset Adam state at the lr decay point + #restore_vars = list(filter( + # lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', + # restore_vars)) train_xe_op = tx.core.get_train_op( loss_xe, hparams=config_train.train_xe) @@ -167,7 +135,6 @@ def main(): train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) - train_op = train_xe_op if pretraining else train_debleu_op tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) tf.summary.scalar('tm/n_mask', tm_helper.n_mask) @@ -178,22 +145,31 @@ def main(): def _train_epoch(sess, summary_writer, train_op, trigger): print('in _train_epoch') + data_iterator.restart_dataset(sess, 'train') feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, data_iterator.handle: data_iterator.get_handle(sess, 'train') } - for loss, summary, step in get_data_loader( - sess, (train_op, merged_summary, global_step), feed_dict): - summary_writer.add_summary(summary, step) - if step % config_train.steps_per_eval == 0: - _eval_epoch(sess, summary_writer, 'val', trigger) + while True: + try: + loss, summary, step = sess.run( + (train_op, merged_summary, global_step), feed_dict) + + summary_writer.add_summary(summary, step) + + if step % config_train.steps_per_eval == 0: + _eval_epoch(sess, summary_writer, 'val', trigger) + + except tf.errors.OutOfRangeError: + break print('end _train_epoch') def _eval_epoch(sess, summary_writer, mode, trigger): print('in _eval_epoch with mode {}'.format(mode)) + data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.EVAL, @@ -205,20 +181,27 @@ def _eval_epoch(sess, summary_writer, mode, trigger): data_batch['target_text'][:, 1:], infer_outputs.predicted_ids[:, :, 0] ] - for target_texts_ori, output_ids in \ - get_data_loader(sess, fetches, feed_dict): - target_texts = tx.utils.strip_special_tokens(target_texts_ori) - output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) - ref_hypo_pairs.extend( - zip(map(lambda x: [x], target_texts), output_texts)) + while True: + try: + target_texts_ori, output_ids = sess.run(fetches, feed_dict) + target_texts = tx.utils.strip_special_tokens(target_texts_ori) + output_texts = tx.utils.map_ids_to_strs( + ids=output_ids, vocab=val_data.target_vocab) + + ref_hypo_pairs.extend( + zip(map(lambda x: [x], target_texts), output_texts)) 
+ + except tf.errors.OutOfRangeError: + break refs, hypos = zip(*ref_hypo_pairs) bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, hypotheses=hypos) + print('{} BLEU: {}'.format(mode, bleu)) step = tf.train.global_step(sess, global_step) + summary = tf.Summary() summary.value.add(tag='{}/BLEU'.format(mode), simple_value=bleu) summary_writer.add_summary(summary, step) @@ -234,16 +217,13 @@ def _eval_epoch(sess, summary_writer, mode, trigger): best_val_bleu = -1 with tf.Session() as sess: - if pretraining: - trigger = None - else: - action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) - for n_unmask, n_mask in mask_patterns[1:]) - trigger = tx.utils.BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) + action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) + for n_unmask, n_mask in mask_patterns[1:]) + trigger = tx.utils.BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.minimum_interval_steps, + default=None) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) @@ -257,15 +237,14 @@ def _eval_epoch(sess, summary_writer, mode, trigger): if os.path.exists(dir_model): ckpt_path = tf.train.latest_checkpoint(dir_model) print('restoring from {} ...'.format(ckpt_path)) - optimistic_restore(sess, ckpt_path) + saver.restore(sess, ckpt_path) - if trigger is not None: - trigger_path = '{}.trigger'.format(ckpt_path) - if os.path.exists(trigger_path): - with open(trigger_path, 'r') as pickle_file: - trigger.restore_from_pickle(pickle_file) - else: - print('cannot find previous trigger state.') + trigger_path = '{}.trigger'.format(ckpt_path) + if os.path.exists(trigger_path): + with open(trigger_path, 'r') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous trigger state.') print('done.') @@ -274,29 +253,38 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch = 0 while epoch < config_train.max_epochs: - print('epoch #{}:'.format(epoch)) + pretraining = epoch < pretrain_epochs + print('epoch #{}{}:'.format( + epoch, ' (pretraining)' if pretraining else '')) + val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + step = tf.train.global_step(sess, global_step) + print('epoch: {}, step: {}, val bleu: {}'.format( + epoch, step, val_bleu)) + if val_bleu > best_val_bleu: best_val_bleu = val_bleu - step = tf.train.global_step(sess, global_step) - print('epoch: {}, step: {}, best val bleu: {}'.format( - epoch, step, best_val_bleu)) + print('update best val bleu: {}'.format(best_val_bleu)) + saved_path = saver.save( sess, ckpt_best, global_step=step) - if trigger is not None: + if not pretraining: with open('{}.trigger'.format(ckpt_best), 'w') as \ pickle_file: trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) - _train_epoch(sess, summary_writer, train_op, trigger) + train_op = train_xe_op if pretraining else train_debleu_op + _train_epoch(sess, summary_writer, train_op, + None if pretraining else trigger) epoch += 1 + step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) - if trigger is not None: + if not pretraining: with open('{}.trigger'.format(ckpt_model), 'w') as pickle_file: trigger.save_to_pickle(pickle_file) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index efa339e0..af814029 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -29,6 +29,7 @@ DEFAULT_ACTION = 
object() + class Trigger(object): def __init__(self, action, default=DEFAULT_ACTION): From a4fdd5a7b78a765e4c8f00d66c569d981350f440 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 13 Oct 2018 23:30:31 -0400 Subject: [PATCH 27/65] add comments to debleu.py --- texar/losses/debleu.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/texar/losses/debleu.py b/texar/losses/debleu.py index eeb9ba04..51dffe9c 100644 --- a/texar/losses/debleu.py +++ b/texar/losses/debleu.py @@ -144,23 +144,25 @@ def debleu(labels, probs, sequence_length, time_major=False, """ # TODO: rewrite example with tf.name_scope(name, "debleu"): - X = probs - Y = labels + X = probs # p_theta(y) + Y = labels # y* if time_major: X = tf.transpose(X, [1, 0, 2]) Y = tf.transpose(Y, [1, 0]) - sizeX = tf.shape(X)[1] - sizeY = tf.shape(Y)[1] + T_X = tf.shape(X)[1] # max T + T_Y = tf.shape(Y)[1] # max T* - XY = batch_gather(X, tf.tile(tf.expand_dims(Y, 1), [1, sizeX, 1])) + # XY denotes p(y_i=y*_j) + XY = batch_gather(X, tf.tile(tf.expand_dims(Y, 1), [1, T_X, 1])) + # YY denotes 1(y*_j=y*_j') YY = tf.to_float(tf.equal(tf.expand_dims(Y, 2), tf.expand_dims(Y, 1))) maskX = tf.sequence_mask( - sequence_length + 1, maxlen=sizeX + 1, dtype=tf.float32) + sequence_length + 1, maxlen=T_X + 1, dtype=tf.float32) maskY = tf.sequence_mask( - sequence_length + 1, maxlen=sizeY + 1, dtype=tf.float32) + sequence_length + 1, maxlen=T_Y + 1, dtype=tf.float32) matchXY = tf.expand_dims(maskX, 2) * tf.expand_dims(maskY, 1) matchYY = tf.minimum(tf.expand_dims(maskY, 2), tf.expand_dims(maskY, 1)) @@ -168,26 +170,29 @@ def debleu(labels, probs, sequence_length, time_major=False, tot = [] o = [] - for order in range(max_order): - matchXY = XY[:, : sizeX - order, : sizeY - order] * \ - matchXY[:, 1:, 1:] - matchYY = YY[:, : sizeY - order, : sizeY - order] * \ - matchYY[:, 1:, 1:] + for order in range(max_order): # order = n - 1 + # Eq.20 + matchXY = XY[:, : T_X - order, : T_Y - order] * matchXY[:, 1:, 1:] + matchYY = YY[:, : T_Y - order, : T_Y - order] * matchYY[:, 1:, 1:] cntYX = tf.reduce_sum(matchXY, 1, keepdims=True) cntYY = tf.reduce_sum(matchYY, 1, keepdims=True) + # Eq.14 o_order = tf.reduce_sum(tf.reduce_sum( min_fn(cntYY / (cntYX - matchXY + 1)) * matchXY / tf.maximum(1., cntYY), 2), 1) - # in order to avoid being divided by 0 + # calculate (T - n + 1); max(1, .) 
is to avoid being divided by 0 tot_order = tf.maximum(1, sequence_length - order) tot.append(tot_order) o.append(o_order) tot = tf.stack(tot, 1) o = tf.stack(o, 1) + # Eq.15 prec = tf.reduce_sum(o, 0) / tf.to_float(tf.reduce_sum(tot, 0)) + # add epsilon in order to avoid inf gradient neglog_prec = -tf.log(prec + epsilon) + # Eq.17; constant about BP is omitted loss = tf.reduce_sum(weights * neglog_prec, 0) return loss From 77c0a52bf1988f14ed7e180ea5c9f289fea5e156 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 01:35:09 -0400 Subject: [PATCH 28/65] add name_scope to TeacherMaskSoftmaxEmbeddingHelper --- texar/modules/decoders/rnn_decoder_helpers.py | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index c12447d7..99885440 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -333,30 +333,35 @@ def sample(self, time, outputs, state, name=None): class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): def __init__(self, inputs, sequence_length, embedding, n_unmask, n_mask, tau=1., time_major=False, seed=None, - stop_gradient=False): - super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( - inputs=inputs, - sequence_length=sequence_length, - time_major=time_major) - - self._embedding, self._embedding_fn = get_embedding_and_fn(embedding) - self._tau = tau - self._seed = seed - self._stop_gradient = stop_gradient - - self._zero_next_inputs = tf.zeros_like( - self._embedding_fn(self._zero_inputs)) - - self._n_unmask = tf.Variable(n_unmask, name='n_unmask') - self._n_mask = tf.Variable(n_mask, name='n_mask') - self._n_cycle = tf.add(self._n_unmask, self._n_mask, name='n_cycle') - self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) - self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) - self._assign_n_unmask = tf.assign(self._n_unmask, self._new_n_unmask) - self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) - self._n_shift = tf.random_uniform( - [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, - seed=self._seed, name='n_shift') + stop_gradient=False, name=None): + with tf.name_scope(name, "TeacherMaskSoftmaxEmbeddingHelper", + [embedding, tau, seed, stop_gradient]): + super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( + inputs=inputs, + sequence_length=sequence_length, + time_major=time_major) + + self._embedding, self._embedding_fn = get_embedding_and_fn( + embedding) + self._tau = tau + self._seed = seed + self._stop_gradient = stop_gradient + + self._zero_next_inputs = tf.zeros_like( + self._embedding_fn(self._zero_inputs)) + + self._n_unmask = tf.Variable(n_unmask, name='n_unmask') + self._n_mask = tf.Variable(n_mask, name='n_mask') + self._n_cycle = tf.add( + self._n_unmask, self._n_mask, name='n_cycle') + self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) + self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) + self._assign_n_unmask = tf.assign( + self._n_unmask, self._new_n_unmask) + self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) + self._n_shift = tf.random_uniform( + [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, + seed=self._seed, name='n_shift') @property def sample_ids_dtype(self): From c70b8e256134537cc5fdbf280a9ca28fafa92d93 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 01:44:01 -0400 Subject: [PATCH 29/65] fix lr decay boundaries --- 
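As a quick sanity check on the annotated computation above, the clipped expected n-gram match of Eq.14 can be written out by hand for the unigram case. The following standalone sketch is illustrative only — the toy probabilities, the use of NumPy, and the batch-free shapes are assumptions for exposition, not part of this patch:

```
# Illustrative only: a batch-free, unigram (order n=1) version of Eq.14,
# using the same names as the comments above (XY ~ p(y_i = y*_j)).
import numpy as np

probs = np.array([[0.7, 0.2, 0.1],    # p_theta(y_1) over a 3-word vocabulary
                  [0.1, 0.8, 0.1]])   # p_theta(y_2)
labels = np.array([0, 1])             # reference y* = (0, 1)

XY = probs[:, labels]                  # XY[i, j] = p(y_i = y*_j)
cntYX = XY.sum(axis=0)                 # expected count of y*_j in the output
cntYY = np.array([(labels == t).sum() for t in labels])  # count of y*_j in y*

# Eq.14 with min_fn = min(1., x): expected clipped unigram matches.
o_1 = sum(
    min(1., cntYY[j] / (cntYX[j] - XY[i, j] + 1.)) * XY[i, j] / max(1., cntYY[j])
    for i in range(XY.shape[0]) for j in range(XY.shape[1]))
print(o_1)  # ~1.47 for these probabilities
```

In the deterministic (one-hot) limit this quantity reduces to BLEU-style clipped counts, which is the motivation given in the paper.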
examples/differentiable_expected_bleu/config_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index fe175a22..ccdde330 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -15,7 +15,7 @@ "learning_rate_decay": { "type": "piecewise_constant", "kwargs": { - "boundaries": [160000], + "boundaries": [10000], "values": [1e-3, 1e-5], }, }, From 6daaac8ed89929ec56e15945cd571d9004f9ac68 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 13:32:53 -0400 Subject: [PATCH 30/65] fix save trigger path --- .../differentiable_expected_bleu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 782846a7..86eec55f 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -270,7 +270,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess, ckpt_best, global_step=step) if not pretraining: - with open('{}.trigger'.format(ckpt_best), 'w') as \ + with open('{}.trigger'.format(saved_path), 'w') as \ pickle_file: trigger.save_to_pickle(pickle_file) @@ -285,7 +285,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): saved_path = saver.save(sess, ckpt_model, global_step=step) if not pretraining: - with open('{}.trigger'.format(ckpt_model), 'w') as pickle_file: + with open('{}.trigger'.format(saved_path), 'w') as pickle_file: trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) From afacfe948f9b1999b678355f3fb61efcf27f3a9f Mon Sep 17 00:00:00 2001 From: wwt Date: Sun, 14 Oct 2018 16:48:01 -0400 Subject: [PATCH 31/65] add docs --- docs/code/losses.rst | 8 +++ docs/code/modules.rst | 5 ++ docs/code/utils.rst | 13 ++++ .../differentiable_expected_bleu.py | 4 -- texar/losses/debleu.py | 56 ++++++++++------ texar/modules/decoders/rnn_decoder_helpers.py | 65 ++++++++++++++++++- texar/utils/triggers.py | 32 +++++++++ 7 files changed, 158 insertions(+), 25 deletions(-) diff --git a/docs/code/losses.rst b/docs/code/losses.rst index df1a14a3..87a0c6b0 100644 --- a/docs/code/losses.rst +++ b/docs/code/losses.rst @@ -68,6 +68,14 @@ Entropy .. autofunction:: texar.losses.sequence_entropy_with_logits +DEBLEU +================== + +:hidden:`debleu` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: texar.losses.debleu + + Loss Utils =========== diff --git a/docs/code/modules.rst b/docs/code/modules.rst index d3d6443e..5aac39c0 100644 --- a/docs/code/modules.rst +++ b/docs/code/modules.rst @@ -134,6 +134,11 @@ Decoders .. autoclass:: texar.modules.GumbelSoftmaxEmbeddingHelper :members: +:hidden:`TeacherMaskSoftmaxEmbeddingHelper` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.modules.TeacherMaskSoftmaxEmbeddingHelper + :members: + :hidden:`get_helper` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: texar.modules.get_helper diff --git a/docs/code/utils.rst b/docs/code/utils.rst index 5c113c1a..c463c752 100644 --- a/docs/code/utils.rst +++ b/docs/code/utils.rst @@ -278,3 +278,16 @@ AverageRecorder ========================== .. 
autoclass:: texar.utils.AverageRecorder :members: + +Trigger +========================== + +:hidden:`Trigger` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.utils.Trigger + :members: + +:hidden:`BestEverConvergenceTrigger` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.utils.BestEverConvergenceTrigger + :members: diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 86eec55f..ee12b3dc 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -74,10 +74,6 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - #TODO: find a way to reset Adam state at the lr decay point - #restore_vars = list(filter( - # lambda var: var.name.split(':')[0].split('/')[0] != 'OptimizeLoss', - # restore_vars)) train_xe_op = tx.core.get_train_op( loss_xe, hparams=config_train.train_xe) diff --git a/texar/losses/debleu.py b/texar/losses/debleu.py index 51dffe9c..1d1307db 100644 --- a/texar/losses/debleu.py +++ b/texar/losses/debleu.py @@ -99,32 +99,42 @@ def batch_gather(params, indices, name=None): def debleu(labels, probs, sequence_length, time_major=False, min_fn=lambda x: tf.minimum(1., x), max_order=4, weights=[.1, .3, .3, .3], epsilon=1e-9, name=None): - """Computes sparse softmax cross entropy for each time step of sequence - predictions. + """Computes Differentiable Expected BLEU (DEBLEU). See + https://openreview.net/pdf?id=S1x2aiRqFX for details. Args: - labels: Target class indexes. I.e., classes are mutually exclusive - (each entry is in exactly one class). + labels: Target sequence token indexes, i.e. y* in the paper. - If :attr:`time_major` is `False` (default), this must be\ - a Tensor of shape `[batch_size, max_time]`. + a tensor of shape `[batch_size, max_time]`. - - If `time_major` is `True`, this must be a Tensor of shape\ + - If `time_major` is `True`, this must be a tensor of shape\ `[max_time, batch_size].` - logits: Unscaled log probabilities. This must have the shape of - `[max_time, batch_size, num_classes]` or - `[batch_size, max_time, num_classes]` according to + probs: Probabilities generated by model, i.e. y in the paper. This must + have the shape of + `[max_time, batch_size, vocab_size]` or + `[batch_size, max_time, vocab_size]` according to the value of `time_major`. - sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond + sequence_length: A tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will have zero losses. time_major (bool): The shape format of the inputs. If `True`, - :attr:`labels` and :attr:`logits` must have shape + :attr:`labels` and :attr:`probs` must have shape `[max_time, batch_size, ...]`. If `False` (default), they must have shape `[batch_size, max_time, ...]`. + min_fn (function, optional): A python function that implements the min + operation in Eq.14 in the paper. Default to tf.minimum(1., x). + max_order (int, optional): Maximum order of grams calculated. Default + to 4. + weights (optional): A tensor (or simply Python list) of shape + `[max_order]` of which the i-th scalar is the weight of (i+1) gram + precision. Default to `[0.1, 0.3, 0.3, 0.3]`. + epsilon (float, optional): A small value added before applying + logarithm in Eq.17 in the paper. This is in order to avoid infinite + gradients. 
Default to 1e-9. name (str, optional): A name for the operation. Returns: - A Tensor containing the loss of rank 0. + A tensor containing the loss of rank 0. Example: @@ -132,17 +142,23 @@ def debleu(labels, probs, sequence_length, time_major=False, embedder = WordEmbedder(vocab_size=data.vocab.size) decoder = BasicRNNDecoder(vocab_size=data.vocab.size) - outputs, _, _ = decoder( - decoding_strategy='train_greedy', - inputs=embedder(data_batch['text_ids']), - sequence_length=data_batch['length']-1) - - loss = sequence_sparse_softmax_cross_entropy( + + tm_helper = texar.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=data_batch['text_ids'], + sequence_length=data_batch['length']-1, + embedding=embedder, + n_unmask=1, + n_mask=0, + tau=1.) + + outputs, _, _ = decoder(helper=tm_helper) + + loss = debleu( labels=data_batch['text_ids'][:, 1:], - logits=outputs.logits, + probs=outputs.sample_ids, sequence_length=data_batch['length']-1) - """ # TODO: rewrite example + """ with tf.name_scope(name, "debleu"): X = probs # p_theta(y) Y = labels # y* diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 99885440..f8e7040e 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -331,6 +331,66 @@ def sample(self, time, outputs, state, name=None): class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): + """A helper that implements the Teacher Mask described in the paper + https://openreview.net/pdf?id=S1x2aiRqFX. In an unmasked step, it feeds + softmax probabilities over vocabulary to the next step. In a masked step, + it feeds the one-hot distribution of the target labels (:attr:`inputs`) + to the next step. + Uses the softmax probability or one-hot vector to pass through word + embeddings to get the next input (i.e., a mixed word embedding). + In this implementation, all sequences in a batch shares the same teacher + mask. + + A subclass of + :tf_main:`TrainingHelper `. + Used as a helper to :class:`~texar.modules.RNNDecoderBase` :meth:`_build` + in training mode. + + Args: + inputs (2D Tensor): Target sequence token indexes. It should be a tensor + of shape `[batch_size, max_time]`. Must append both BOS and EOS + tokens to each sequence. + sequence_length (1D Tensor): Lengths of input token sequences. These + lengths should include the BOS tokens but exclude the EOS tokens. + embedding: An embedding argument (:attr:`params`) for + :tf_main:`tf.nn.embedding_lookup `, or an + instance of subclass of :class:`texar.modules.EmbedderBase`. + Note that other callables are not acceptable here. + n_unmask: An int scalar tensor denotes the mask pattern together with + :attr:`n_mask`. See the paper for details. + n_mask: An int scalar tensor denotes the mask pattern together with + :attr:`n_unmask`. See the paper for details. + tau (float, optional): A float scalar tensor, the softmax temperature. + Default to 1. + seed (int, optional): The random seed used to shift the mask. + stop_gradient (bool): Whether to stop the gradient backpropagation + when feeding softmax vector to the next step. + name (str, optional): A name for the module. + + Example: + + .. code-block:: python + + embedder = WordEmbedder(vocab_size=data.vocab.size) + decoder = BasicRNNDecoder(vocab_size=data.vocab.size) + + tm_helper = texar.modules.TeacherMaskSoftmaxEmbeddingHelper( + inputs=data_batch['text_ids'], + sequence_length=data_batch['length']-1, + embedding=embedder, + n_unmask=1, + n_mask=0, + tau=1.) 
+ + outputs, _, _ = decoder(helper=tm_helper) + + loss = debleu( + labels=data_batch['text_ids'][:, 1:], + probs=outputs.sample_ids, + sequence_length=data_batch['length']-1) + + """ + def __init__(self, inputs, sequence_length, embedding, n_unmask, n_mask, tau=1., time_major=False, seed=None, stop_gradient=False, name=None): @@ -397,7 +457,10 @@ def initialize(self, name=None): return (finished, next_inputs) def sample(self, time, outputs, state, name=None): - """Returns `sample_id` of shape `[batch_size, vocab_size]`. + """Returns `sample_id` of shape `[batch_size, vocab_size]`. In an + unmasked step, it is softmax distributions over vocabulary with + temperature :attr:`tau`; in a masked step, it is one-hot + representations of :attr:`input` in the next step. """ next_time = time + 1 sample_ids = tf.cond( diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index af814029..e4f8f967 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -27,10 +27,30 @@ #pylint: disable=invalid-name, too-many-arguments, too-many-locals +__all__ = [ + "Trigger", + "BestEverConvergenceTrigger", +] + + DEFAULT_ACTION = object() class Trigger(object): + """A trigger can do some action when certain condition is met. + Specifically, the user calls the trigger periodically. Every time the + trigger is called, it will send all arguments to :meth:`_predicate`, which + returns a boolean value indicates whether the condition is met. Once the + condition is met, the trigger will then call `next(action)` to do next + action and obtain the returned value. + + Args: + action (iterable): An iterable which does the action and possibly + returns a value. + default: The value returned after :attr:`action` stops iteration. If + not provided, the trigger will do nothing when StopIteration + occurs. + """ def __init__(self, action, default=DEFAULT_ACTION): """action is an iterator that iteratively do a sequence of action and @@ -68,9 +88,21 @@ def _state_names(self): @property def state(self): + """The current state which can be used to save and restore the trigger. + The state records how many times `next(action)` has been called. + """ return self._make_state(self._state_names) def restore_from_state(self, state): + """Restore the trigger state from the previous stored state. + Note that this function will call `next(action)` for the exact times + that the :py:attr:`state` records how many times `next(action)` had + been called. The user should be aware of any possible side effect of + this behavior. + + Args: + state: The state previously obtained by :py:attr:`state`. + """ for name, value in state.items(): setattr(self, name, value) From 0794ddcbbd9b8e631c1624b596a5b833d3b26b4f Mon Sep 17 00:00:00 2001 From: wwt Date: Sun, 14 Oct 2018 21:17:45 -0400 Subject: [PATCH 32/65] add more trigger docs --- texar/utils/triggers.py | 82 +++++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index e4f8f967..ce3c7183 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -37,32 +37,29 @@ class Trigger(object): - """A trigger can do some action when certain condition is met. - Specifically, the user calls the trigger periodically. Every time the - trigger is called, it will send all arguments to :meth:`_predicate`, which - returns a boolean value indicates whether the condition is met. 
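To make the `(n_unmask, n_mask)` arguments of the helper documented above more concrete, here is a rough, standalone illustration of a teacher-mask pattern. The modular rule and the names below are assumptions for intuition only; the helper's actual masking logic is not shown in this hunk:

```
# Schematic only (not the helper's code): with n_unmask=2, n_mask=2, each
# cycle of 4 decoding steps feeds the softmax output for 2 steps and the
# ground-truth one-hot embedding for the other 2; n_shift rotates the cycle.
def teacher_mask(max_time, n_unmask, n_mask, n_shift=0):
    n_cycle = n_unmask + n_mask
    # True  -> "unmasked" step: feed the softmax distribution onward
    # False -> "masked" step:   feed the one-hot ground-truth token instead
    return [(t + n_shift) % n_cycle < n_unmask for t in range(max_time)]

print(teacher_mask(8, n_unmask=2, n_mask=2))  # [True, True, False, False, ...]
print(teacher_mask(8, n_unmask=1, n_mask=0))  # all True: mask fully annealed
```

Annealing `mask_patterns` from `(2, 2)` towards `(1, 0)`, as in the training configs, gradually removes the ground-truth inputs.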
Once the
-    condition is met, the trigger will then call `next(action)` to do next
-    action and obtain the returned value.
+    """This is the base class of all triggers. A trigger performs some action
+    when a certain condition is met. Specifically, the user calls the trigger
+    periodically. Every time the trigger is called, it sends all arguments to
+    :meth:`_predicate`, which returns a boolean value indicating whether the
+    condition is met. Once the condition is met, the trigger calls
+    `next(action)` to perform the next action and obtain the returned value.
 
     Args:
-        action (iterable): An iterable which does the action and possibly
-            returns a value.
-        default: The value returned after :attr:`action` stops iteration. If
-            not provided, the trigger will do nothing when StopIteration
+        action (iterable): An iterable which iteratively does the action and
+            possibly returns a value.
+        default (optional): The value returned once :attr:`action` is exhausted.
+            If not provided, the trigger will do nothing when `StopIteration`
             occurs.
     """
 
     def __init__(self, action, default=DEFAULT_ACTION):
-        """action is an iterator that iteratively do a sequence of action and
-        return result values. default is used as result value when action is
-        exhausted.
-        """
         self._action = iter(action)
         self._default = default
         self._triggered_times = 0
 
     def _predicate(self, *args, **kwargs):
-        """This function returns True when we think we should do something.
+        """This function returns True when the condition is met and we should
+        do something.
         """
         raise NotImplementedError
 
@@ -84,6 +81,9 @@ def _make_state(self, names):
 
     @property
     def _state_names(self):
+        """Returns a list of names of the trigger attributes that can be saved
+        and restored as the trigger state.
+        """
        return ['_triggered_times']
 
     @property
@@ -110,9 +110,30 @@ def restore_from_state(self, state):
             self._next_action()
 
     def save_to_pickle(self, file):
+        """Writes a pickled representation of the trigger state to the open
+        file-like object :attr:`file`.
+
+        Args:
+            file: The open file-like object to which we write. As described in
+                the official pickle documentation, it must have a `write()`
+                method that accepts a single string argument.
+        """
         pickle.dump(self.state, file)
 
     def restore_from_pickle(self, file):
+        """Reads a string from the open file-like object :attr:`file` and
+        restores the trigger state from it.
+        Note that this function will call `next(action)` exactly as many times
+        as recorded in the restored :py:attr:`state`. The user should be aware
+        of any possible side effects of this behavior.
+
+        Args:
+            file: The open file-like object from which we read. As described in
+                the official pickle documentation, it must have a `read()`
+                method that takes an integer argument, and a `readline()`
+                method that requires no arguments; both methods should return
+                a string.
+        """
         self.restore_from_state(pickle.load(file))
 
 
@@ -137,6 +158,25 @@ def _predicate(self, step):
 
 
 class BestEverConvergenceTrigger(Trigger):
+    """A trigger that maintains the best-ever value of a metric. It triggers
+    when the best value of the metric has not been updated for at least
+    :attr:`threshold_steps` steps. In order to avoid triggering too
+    frequently, it will not trigger again within
+    :attr:`minimum_interval_steps` steps of its last firing.
+
+    Args:
+        action (iterable): An iterable which iteratively does the action and
+            possibly returns a value.
+        threshold_steps (int): Number of steps to wait after the best value
+            was last updated before triggering.
+        minimum_interval_steps (int): Minimum number of steps between two
+            consecutive firings of the trigger.
+        default (optional): The value returned once :attr:`action` is exhausted.
+            If not provided, the trigger will do nothing when `StopIteration`
+            occurs.
+
+    .. document private functions
+    .. automethod:: __call__
+    """
 
     def __init__(self, action, threshold_steps, minimum_interval_steps,
                  default=DEFAULT_ACTION):
@@ -160,6 +200,18 @@ def _predicate(self, step, score):
             return True
         return False
 
+    def __call__(self, step, score):
+        """The trigger must be called to update the current training step
+        (:attr:`step`) and the current value of the maintained metric
+        (:attr:`score`).
+
+        Args:
+            step (int): Current training step to update. Training steps must
+                be reported in ascending order.
+            score (float): Current value of the maintained metric.
+        """
+        return super(BestEverConvergenceTrigger, self).__call__(step, score)
+
     @property
     def _state_names(self):
         return super(BestEverConvergenceTrigger, self)._state_names + [

From 0d3e18755dc5b3be1537a126a5926d800432d17e Mon Sep 17 00:00:00 2001
From: wwt 
Date: Sun, 14 Oct 2018 21:33:40 -0400
Subject: [PATCH 33/65] update README.md

---
 examples/differentiable_expected_bleu/README.md | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md
index 9b481c5e..eb7bcf9e 100644
--- a/examples/differentiable_expected_bleu/README.md
+++ b/examples/differentiable_expected_bleu/README.md
@@ -1,20 +1,14 @@
 # Seq2seq Model #
 
-This example builds an attentional seq2seq model for machine translation.
-
-## Usage ##
+This example builds an attentional seq2seq model for machine translation trained with Differentiable Expected BLEU (DEBLEU) and Teacher Mask. See https://openreview.net/pdf?id=S1x2aiRqFX for the implemented paper.
 
 ### Dataset ###
 
-Two example datasets are provided:
-
-  * toy_copy: A small toy autoencoding dataset from [TF Seq2seq toolkit](https://github.com/google/seq2seq/tree/2500c26add91b079ca00cf1f091db5a99ddab9ae).
   * iwslt14: The benchmark [IWSLT2014](https://sites.google.com/site/iwsltevaluation2014/home) (de-en) machine translation dataset.
 
 Download the data with the following cmds:
 
 ```
-python prepare_data.py --data toy_copy
 python prepare_data.py --data iwslt14
 ```
 
@@ -23,18 +17,17 @@ python prepare_data.py --data iwslt14
 Train the model with the following cmd:
 
 ```
-python seq2seq_attn.py --config_model config_model --config_data config_toy_copy
+python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14 --config_train config_train --pretrain_epochs 8
 ```
 
 Here:
   * `--config_model` specifies the model config. Note not to include the `.py` suffix.
   * `--config_data` specifies the data config.
+  * `--config_train` specifies the training config.
+  * `--pretrain_epochs` specifies the number of epochs to pretrain with cross-entropy loss.
 
 [config_model.py](./config_model.py) specifies a single-layer seq2seq model with Luong attention and bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file.
 
-For demonstration purpose, [config_model_full.py](./config_model_full.py) gives all possible hyperparameters for the model. The two config files will lead to the same model.
-
 ## Results ##
 
-On the IWSLT14 dataset, using original target texts as reference(no `` in the reference), the model achieves `BLEU=21.66` within `10` epochs.
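Putting the trigger API documented above together with the teacher-mask helper, the intended wiring in the example script looks roughly like the sketch below. It reuses `tm_helper`, `sess`, `step` and `val_bleu` from the surrounding script and assumes, rather than shows, the exact call site:

```
# Sketch of the intended usage (names taken from the example script).
mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)]

# Each next() on the action anneals the teacher mask to the next pattern.
action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask)
          for n_unmask, n_mask in mask_patterns[1:])

trigger = tx.utils.BestEverConvergenceTrigger(
    action,
    threshold_steps=10000,
    minimum_interval_steps=10000,
    default=None)

# After every evaluation, report the global step and the validation BLEU;
# once the best BLEU has stalled for threshold_steps, the trigger fires and
# advances to the next mask pattern.
trigger(step, val_bleu)
```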
- +On the IWSLT14 dataset, the model achieves `BLEU=25.35` after annealed all masks, while the cross-entropy trained model achieves `BLEU=24.57`. From b09578554f730f2f18cbce09e39cba26b5dae3a5 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 23:00:26 -0400 Subject: [PATCH 34/65] rename some filenames ; add val/test datasets --- .../differentiable_expected_bleu/README.md | 2 +- ...wslt14.py => config_data_iwslt14_de-en.py} | 17 ++++---- .../config_data_iwslt14_en-fr.py | 43 +++++++++++++++++++ ...train.py => config_train_iwslt14_de-en.py} | 2 +- .../differentiable_expected_bleu.py | 9 ++-- .../prepare_data.py | 17 +++----- 6 files changed, 65 insertions(+), 25 deletions(-) rename examples/differentiable_expected_bleu/{config_iwslt14.py => config_data_iwslt14_de-en.py} (63%) create mode 100644 examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py rename examples/differentiable_expected_bleu/{config_train.py => config_train_iwslt14_de-en.py} (96%) diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md index eb7bcf9e..0e1d2ad2 100644 --- a/examples/differentiable_expected_bleu/README.md +++ b/examples/differentiable_expected_bleu/README.md @@ -9,7 +9,7 @@ This example builds an attentional seq2seq model for machine translation trained Download the data with the following cmds: ``` -python prepare_data.py --data iwslt14 +python prepare_data.py --data de-en ``` ### Train the model ### diff --git a/examples/differentiable_expected_bleu/config_iwslt14.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py similarity index 63% rename from examples/differentiable_expected_bleu/config_iwslt14.py rename to examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index cfcc6d71..bbe3954b 100644 --- a/examples/differentiable_expected_bleu/config_iwslt14.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,16 +1,16 @@ -source_vocab_file = 'data/iwslt14/vocab.de' -target_vocab_file = 'data/iwslt14/vocab.en' +source_vocab_file = 'data/iwslt14_de-en/vocab.de' +target_vocab_file = 'data/iwslt14_de-en/vocab.en' train = { 'batch_size': 80, 'allow_smaller_final_batch': False, 'source_dataset': { - "files": 'data/iwslt14/train.de', + "files": 'data/iwslt14_de-en/train.de', 'vocab_file': source_vocab_file, 'max_seq_length': 50 }, 'target_dataset': { - 'files': 'data/iwslt14/train.en', + 'files': 'data/iwslt14_de-en/train.en', 'vocab_file': target_vocab_file, 'max_seq_length': 50 }, @@ -19,11 +19,11 @@ 'batch_size': 80, 'shuffle': False, 'source_dataset': { - "files": 'data/iwslt14/valid.de', + "files": 'data/iwslt14_de-en/valid.de', 'vocab_file': source_vocab_file, }, 'target_dataset': { - 'files': 'data/iwslt14/valid.en', + 'files': 'data/iwslt14_de-en/valid.en', 'vocab_file': target_vocab_file, }, } @@ -31,12 +31,11 @@ 'batch_size': 80, 'shuffle': False, 'source_dataset': { - "files": 'data/iwslt14/test.de', + "files": 'data/iwslt14_de-en/test.de', 'vocab_file': source_vocab_file, }, 'target_dataset': { - 'files': 'data/iwslt14/test.en', + 'files': 'data/iwslt14_de-en/test.en', 'vocab_file': target_vocab_file, }, } -val = test diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py new file mode 100644 index 00000000..2ebe3b40 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py @@ -0,0 +1,43 @@ +source_vocab_file = 'data/iwslt14_en-fr/vocab.en' 
+target_vocab_file = 'data/iwslt14_en-fr/vocab.fr' + +train = { + 'batch_size': 80, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14_en-fr/train.en', + 'vocab_file': source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14_en-fr/train.fr', + 'vocab_file': target_vocab_file, + 'max_seq_length': 50 + }, +} + +val = { + 'batch_size': 80, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_en-fr/valid.en', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_en-fr/valid.fr', + 'vocab_file': target_vocab_file, + }, +} + +test = { + 'batch_size': 80, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_en-fr/test.en', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_en-fr/test.fr', + 'vocab_file': target_vocab_file, + }, +} diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py similarity index 96% rename from examples/differentiable_expected_bleu/config_train.py rename to examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py index ccdde330..1e37fe9a 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py @@ -42,4 +42,4 @@ }, } -expr_name = 'train' +expr_name = 'iwslt14_de-en' diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ee12b3dc..52bab578 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -27,9 +27,9 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train", "The training config.") +flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") -flags.DEFINE_string("config_data", "config_iwslt14", "The dataset config.") +flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") FLAGS = flags.FLAGS @@ -254,9 +254,10 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch, ' (pretraining)' if pretraining else '')) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + test_bleu = _eval_epoch(sess, summary_writer, 'test', trigger) step = tf.train.global_step(sess, global_step) - print('epoch: {}, step: {}, val bleu: {}'.format( - epoch, step, val_bleu)) + print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( + epoch, step, val_bleu, test_bleu)) if val_bleu > best_val_bleu: best_val_bleu = val_bleu diff --git a/examples/differentiable_expected_bleu/prepare_data.py b/examples/differentiable_expected_bleu/prepare_data.py index a5cc357b..a7557c09 100644 --- a/examples/differentiable_expected_bleu/prepare_data.py +++ b/examples/differentiable_expected_bleu/prepare_data.py @@ -16,31 +16,28 @@ import tensorflow as tf import texar as tx +import os + # pylint: disable=invalid-name flags = tf.flags -flags.DEFINE_string("data", "iwslt14", "Data to download [iwslt14|toy_copy]") +flags.DEFINE_string("data", "de-en", "Data to download [de-en|en-fr]") FLAGS = flags.FLAGS def prepare_data(): """Downloads data. 
""" - if FLAGS.data == 'iwslt14': + if FLAGS.data == 'de-en': tx.data.maybe_download( urls='https://drive.google.com/file/d/' '1Vuv3bed10qUxrpldHdYoiWLzPKa4pNXd/view?usp=sharing', path='./', - filenames='iwslt14.zip', - extract=True) - elif FLAGS.data == 'toy_copy': - tx.data.maybe_download( - urls='https://drive.google.com/file/d/' - '1fENE2rakm8vJ8d3voWBgW4hGlS6-KORW/view?usp=sharing', - path='./', - filenames='toy_copy.zip', + filenames='iwslt14_de-en.zip', extract=True) + os.rename(os.path.join('data', 'iwslt14'), + os.path.join('data', 'iwslt14_de-en')) else: raise ValueError('Unknown data: {}'.format(FLAGS.data)) From 06c572714c6afd7126d3ae574bd76ca012a7da28 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 23:20:04 -0400 Subject: [PATCH 35/65] add config_train_iwslt14_en-fr.py --- .../config_train_iwslt14_en-fr.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py new file mode 100644 index 00000000..e3751956 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py @@ -0,0 +1,45 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "piecewise_constant", + "kwargs": { + "boundaries": [10000], + "values": [1e-3, 1e-5], + }, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ }, + }, +} + +expr_name = 'iwslt14_en-fr' From 5305d3864dc1b5c6a0fbb1d766d2c2d7a82864e1 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 14 Oct 2018 23:21:27 -0400 Subject: [PATCH 36/65] update README.md --- examples/differentiable_expected_bleu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md index 0e1d2ad2..ad5d685c 100644 --- a/examples/differentiable_expected_bleu/README.md +++ b/examples/differentiable_expected_bleu/README.md @@ -17,7 +17,7 @@ python prepare_data.py --data de-en Train the model with the following cmd: ``` -python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14 --config_train config_train --pretrain_epochs 8 +python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14_de-en --config_train config_train_iwslt14_de-en --pretrain_epochs 8 ``` Here: From 3aab0a624089004d478522a570953ed8746a608d Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 15 Oct 2018 20:08:36 -0400 Subject: [PATCH 37/65] replace moses bleu by nltk bleu --- .../differentiable_expected_bleu.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 52bab578..eeba8e28 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -25,6 +25,8 @@ import tensorflow as tf import texar as tx +from nltk.translate.bleu_score import corpus_bleu + flags = tf.flags flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", "The training config.") @@ -181,9 +183,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): while True: try: target_texts_ori, output_ids = sess.run(fetches, feed_dict) - target_texts = tx.utils.strip_special_tokens(target_texts_ori) + target_texts = tx.utils.strip_special_tokens( + target_texts_ori.tolist(), is_token_list=True) output_texts = tx.utils.map_ids_to_strs( - ids=output_ids, vocab=val_data.target_vocab) + ids=output_ids.tolist(), vocab=val_data.target_vocab, + join=False) ref_hypo_pairs.extend( zip(map(lambda x: [x], target_texts), output_texts)) @@ -192,8 +196,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): break refs, hypos = zip(*ref_hypo_pairs) - bleu = tx.evals.corpus_bleu_moses(list_of_references=refs, - hypotheses=hypos) + bleu = corpus_bleu(refs, hypos) * 100 print('{} BLEU: {}'.format(mode, bleu)) step = tf.train.global_step(sess, global_step) From 8ca85a9bcb9e85ea4ea36ddae69287b57d617a32 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Tue, 16 Oct 2018 02:11:10 -0400 Subject: [PATCH 38/65] modify model --- .../config_model.py | 12 ++++++++++- .../differentiable_expected_bleu.py | 20 +++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 3ba0c867..16dba9b9 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -7,13 +7,23 @@ embedder = { 'dim': embedding_dim } + encoder = { 'rnn_cell_fw': { 'kwargs': { 'num_units': num_units - } + }, + 'num_layers': 2 + }, + 'output_layer_fw': { + 'dropout_rate': 0 } } + +connector = { + 'activation_fn': 'tanh' +} + decoder = { 
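For reference, the NLTK `corpus_bleu` call introduced in the patch above expects plain token lists: each hypothesis is a list of tokens, and each entry of the references argument is a list of one or more reference token lists (here exactly one, matching the script). The toy sentences below are made up:

```
from nltk.translate.bleu_score import corpus_bleu

refs = [[['this', 'is', 'a', 'small', 'test']],          # one reference each
        [['another', 'short', 'sentence', 'to', 'score']]]
hypos = [['this', 'is', 'a', 'small', 'test'],
         ['another', 'short', 'sentence', 'for', 'scoring']]

# Corpus-level BLEU in [0, 1]; the script scales it by 100.
print(corpus_bleu(refs, hypos) * 100)
```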
'rnn_cell': { 'kwargs': { diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index eeba8e28..904410bd 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -54,7 +54,8 @@ def build_model(batch, train_data): encoder = tx.modules.BidirectionalRNNEncoder( hparams=config_model.encoder) - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) + enc_outputs, enc_final_state = encoder( + source_embedder(batch['source_text_ids'])) target_embedder = tx.modules.WordEmbedder( vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) @@ -65,9 +66,23 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) + enc_final_state = tf.contrib.framework.nest.map_structure( + lambda *args: tf.concat(args, -1), *enc_final_state) + + if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): + connector = tx.modules.MLPTransformConnector( + decoder.state_size.h, hparams=config_model.connector) + dec_initial_h = connector(enc_final_state.h) + dec_initial_state = (dec_initial_h, enc_final_state.c) + else: + connector = tx.modules.MLPTransformConnector( + decoder.state_size, hparams=config_model.connector) + dec_initial_state = connector(enc_final_state) + # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( decoding_strategy='train_greedy', + initial_state=dec_initial_state, inputs=target_embedder(batch['target_text_ids'][:, :-1]), sequence_length=batch['target_length']-1) @@ -91,7 +106,8 @@ def build_model(batch, train_data): tau=config_train.tau) tm_outputs, _, _ = decoder( - helper=tm_helper) + helper=tm_helper, + initial_state=dec_initial_state) loss_debleu = tx.losses.debleu( labels=batch['target_text_ids'][:, 1:], From 78b6994ce222f4598b7ad7605ba9125290e41dca Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Tue, 16 Oct 2018 22:56:51 -0400 Subject: [PATCH 39/65] refine models --- .../config_model.py | 4 --- .../config_train_iwslt14_de-en.py | 2 -- .../config_train_iwslt14_en-fr.py | 2 -- .../differentiable_expected_bleu.py | 31 ++++++------------- 4 files changed, 9 insertions(+), 30 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 16dba9b9..3b7a8da7 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -20,10 +20,6 @@ } } -connector = { - 'activation_fn': 'tanh' -} - decoder = { 'rnn_cell': { 'kwargs': { diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py index 1e37fe9a..2b057887 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py @@ -41,5 +41,3 @@ }, }, } - -expr_name = 'iwslt14_de-en' diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py index e3751956..2b057887 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py @@ -41,5 +41,3 @@ }, }, } - -expr_name = 'iwslt14_en-fr' diff --git 
a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 904410bd..efe1e255 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,9 +29,13 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", "The training config.") +flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", + "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") -flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") +flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", + "The dataset config.") +flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. " + "Also used as the directory name of run.") flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") FLAGS = flags.FLAGS @@ -39,9 +43,8 @@ config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) +expr_name = FLAGS.expr_name pretrain_epochs = FLAGS.pretrain_epochs - -expr_name = config_train.expr_name mask_patterns = config_train.mask_patterns @@ -54,8 +57,7 @@ def build_model(batch, train_data): encoder = tx.modules.BidirectionalRNNEncoder( hparams=config_model.encoder) - enc_outputs, enc_final_state = encoder( - source_embedder(batch['source_text_ids'])) + enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) target_embedder = tx.modules.WordEmbedder( vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) @@ -66,23 +68,9 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - enc_final_state = tf.contrib.framework.nest.map_structure( - lambda *args: tf.concat(args, -1), *enc_final_state) - - if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): - connector = tx.modules.MLPTransformConnector( - decoder.state_size.h, hparams=config_model.connector) - dec_initial_h = connector(enc_final_state.h) - dec_initial_state = (dec_initial_h, enc_final_state.c) - else: - connector = tx.modules.MLPTransformConnector( - decoder.state_size, hparams=config_model.connector) - dec_initial_state = connector(enc_final_state) - # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( decoding_strategy='train_greedy', - initial_state=dec_initial_state, inputs=target_embedder(batch['target_text_ids'][:, :-1]), sequence_length=batch['target_length']-1) @@ -106,8 +94,7 @@ def build_model(batch, train_data): tau=config_train.tau) tm_outputs, _, _ = decoder( - helper=tm_helper, - initial_state=dec_initial_state) + helper=tm_helper) loss_debleu = tx.losses.debleu( labels=batch['target_text_ids'][:, 1:], From 82bc6a8561f792e4fb6ad8f2ed5a3108b2c9b5d3 Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Wed, 17 Oct 2018 23:31:10 -0400 Subject: [PATCH 40/65] refine summary ; batch_size=160 --- .../config_data_iwslt14_de-en.py | 8 ++-- .../config_data_iwslt14_en-fr.py | 8 ++-- .../config_train_expd1e-2_xe.py | 46 +++++++++++++++++++ .../config_train_expd2e-2_xe.py | 46 +++++++++++++++++++ .../config_train_expd5e-3_xe.py | 46 +++++++++++++++++++ .../differentiable_expected_bleu.py | 29 ++++++++---- 6 files changed, 168 insertions(+), 15 deletions(-) create mode 100644 
examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py create mode 100644 examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py create mode 100644 examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index bbe3954b..fb03a8bb 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,8 +1,10 @@ source_vocab_file = 'data/iwslt14_de-en/vocab.de' target_vocab_file = 'data/iwslt14_de-en/vocab.en' +batch_size = 160 + train = { - 'batch_size': 80, + 'batch_size': batch_size, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14_de-en/train.de', @@ -16,7 +18,7 @@ }, } val = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/valid.de', @@ -28,7 +30,7 @@ }, } test = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/test.de', diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py index 2ebe3b40..a81090e6 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py @@ -1,8 +1,10 @@ source_vocab_file = 'data/iwslt14_en-fr/vocab.en' target_vocab_file = 'data/iwslt14_en-fr/vocab.fr' +batch_size = 160 + train = { - 'batch_size': 80, + 'batch_size': batch_size, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14_en-fr/train.en', @@ -17,7 +19,7 @@ } val = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_en-fr/valid.en', @@ -30,7 +32,7 @@ } test = { - 'batch_size': 80, + 'batch_size': batch_size, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_en-fr/test.en', diff --git a/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py new file mode 100644 index 00000000..69331564 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py @@ -0,0 +1,46 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "exponential_decay", + "kwargs": { + "decay_steps": 10000, + "decay_rate": 1e-2, + }, + "min_learning_rate": 1e-5, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py new file mode 100644 index 00000000..d5e9759a --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py @@ -0,0 +1,46 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. 
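The `config_train_expd*` files added here (continued below) schedule the cross-entropy learning rate with `exponential_decay`. As a rough sketch of the resulting schedule — assuming the optimizer starts from Adam's default learning rate of 1e-3, which these configs do not set explicitly, and that `min_learning_rate` acts as a floor:

```
# Rough shape of the schedule; not Texar's implementation.
def decayed_lr(step, initial_lr=1e-3, decay_rate=2e-2, decay_steps=10000,
               min_lr=1e-5):
    return max(initial_lr * decay_rate ** (step / decay_steps), min_lr)

print(decayed_lr(0))       # 1e-3
print(decayed_lr(10000))   # 2e-05 after one full decay period
print(decayed_lr(20000))   # clipped at the 1e-05 floor
```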
+infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "exponential_decay", + "kwargs": { + "decay_steps": 10000, + "decay_rate": 2e-2, + }, + "min_learning_rate": 1e-5, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py b/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py new file mode 100644 index 00000000..613369fd --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py @@ -0,0 +1,46 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + }, + "learning_rate_decay": { + "type": "exponential_decay", + "kwargs": { + "decay_steps": 10000, + "decay_rate": 5e-3, + }, + "min_learning_rate": 1e-5, + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index efe1e255..5e052096 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -137,14 +137,23 @@ def main(): train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) - tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask) - tf.summary.scalar('tm/n_mask', tm_helper.n_mask) - - merged_summary = tf.summary.merge_all() + summary_tm = [ + tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), + tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] + summary_xe_op = tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope='/'.join(train_xe_op.name.split('/')[:-1])), + name='summary_xe') + summary_debleu_op = tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope='/'.join(train_xe_op.name.split('/')[:-1])) + summary_tm, + name='summary_debleu') saver = tf.train.Saver(max_to_keep=None) - def _train_epoch(sess, summary_writer, train_op, trigger): + def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): print('in _train_epoch') data_iterator.restart_dataset(sess, 'train') @@ -156,7 +165,7 @@ def _train_epoch(sess, summary_writer, train_op, trigger): while True: try: loss, summary, step = sess.run( - (train_op, merged_summary, global_step), feed_dict) + (train_op, summary_op, global_step), feed_dict) summary_writer.add_summary(summary, step) @@ -279,9 +288,11 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('saved to {}'.format(saved_path)) - train_op = train_xe_op if 
pretraining else train_debleu_op - _train_epoch(sess, summary_writer, train_op, - None if pretraining else trigger) + train_op, summary_op, trigger_ = { + True: (train_xe_op, summary_xe_op, None), + False: (train_debleu_op, summary_debleu_op, trigger) + }[pretraining] + _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) epoch += 1 step = tf.train.global_step(sess, global_step) From fffd6486b618d1587ed11584c6dd48c58f677fa0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 18 Oct 2018 19:37:16 +0000 Subject: [PATCH 41/65] remove exponetial decay configs ; fix summary bug --- .../config_train_expd1e-2_xe.py | 46 ------------------- .../config_train_expd2e-2_xe.py | 46 ------------------- .../config_train_expd5e-3_xe.py | 46 ------------------- .../config_train_iwslt14_de-en.py | 4 +- .../config_train_iwslt14_en-fr.py | 4 +- .../differentiable_expected_bleu.py | 8 +++- 6 files changed, 12 insertions(+), 142 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py delete mode 100644 examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py delete mode 100644 examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py diff --git a/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py deleted file mode 100644 index 69331564..00000000 --- a/examples/differentiable_expected_bleu/config_train_expd1e-2_xe.py +++ /dev/null @@ -1,46 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 10000, - "decay_rate": 1e-2, - }, - "min_learning_rate": 1e-5, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py b/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py deleted file mode 100644 index d5e9759a..00000000 --- a/examples/differentiable_expected_bleu/config_train_expd2e-2_xe.py +++ /dev/null @@ -1,46 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 10000, - "decay_rate": 2e-2, - }, - "min_learning_rate": 1e-5, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. 
- }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py b/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py deleted file mode 100644 index 613369fd..00000000 --- a/examples/differentiable_expected_bleu/config_train_expd5e-3_xe.py +++ /dev/null @@ -1,46 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "exponential_decay", - "kwargs": { - "decay_steps": 10000, - "decay_rate": 5e-3, - }, - "min_learning_rate": 1e-5, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py index 2b057887..3ce1d904 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py @@ -15,7 +15,7 @@ "learning_rate_decay": { "type": "piecewise_constant", "kwargs": { - "boundaries": [10000], + "boundaries": [100000], "values": [1e-3, 1e-5], }, }, @@ -25,6 +25,7 @@ "clip_norm": 5. }, }, + "name": "XE" } train_debleu = { @@ -40,4 +41,5 @@ "clip_norm": 5. }, }, + "name": "DEBLEU" } diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py index 2b057887..3ce1d904 100644 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py @@ -15,7 +15,7 @@ "learning_rate_decay": { "type": "piecewise_constant", "kwargs": { - "boundaries": [10000], + "boundaries": [100000], "values": [1e-3, 1e-5], }, }, @@ -25,6 +25,7 @@ "clip_norm": 5. }, }, + "name": "XE" } train_debleu = { @@ -40,4 +41,5 @@ "clip_norm": 5. }, }, + "name": "DEBLEU" } diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 5e052096..dc924bc8 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -48,6 +48,10 @@ mask_patterns = config_train.mask_patterns +def get_scope_by_name(tensor): + return tensor.name[: tensor.name.rfind('/') + 1] + + def build_model(batch, train_data): """Assembles the seq2seq model. 
""" @@ -143,12 +147,12 @@ def main(): summary_xe_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope='/'.join(train_xe_op.name.split('/')[:-1])), + scope=get_scope_by_name(train_xe_op)), name='summary_xe') summary_debleu_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope='/'.join(train_xe_op.name.split('/')[:-1])) + summary_tm, + scope=get_scope_by_name(train_debleu_op)) + summary_tm, name='summary_debleu') saver = tf.train.Saver(max_to_keep=None) From 6d07aa16c821f9f3bfdb9f152b3a6aa600c5d15d Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Fri, 19 Oct 2018 02:36:43 +0000 Subject: [PATCH 42/65] add stages --- .../config_en-fr_xe_1e3_xe_1e5_debleu.py | 41 ----------------- .../config_train_iwslt14_de-en.py | 45 ------------------- .../config_train_iwslt14_en-fr.py | 45 ------------------- .../differentiable_expected_bleu.py | 39 ++++++++++------ 4 files changed, 25 insertions(+), 145 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py delete mode 100644 examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py delete mode 100644 examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py diff --git a/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py b/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py deleted file mode 100644 index 07acbea8..00000000 --- a/examples/differentiable_expected_bleu/config_en-fr_xe_1e3_xe_1e5_debleu.py +++ /dev/null @@ -1,41 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -wait_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, -} - -expr_name = 'en-fr_xe_1e3_xe_1e5_debleu' diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py deleted file mode 100644 index 3ce1d904..00000000 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_de-en.py +++ /dev/null @@ -1,45 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "piecewise_constant", - "kwargs": { - "boundaries": [100000], - "values": [1e-3, 1e-5], - }, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. 
- }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py deleted file mode 100644 index 3ce1d904..00000000 --- a/examples/differentiable_expected_bleu/config_train_iwslt14_en-fr.py +++ /dev/null @@ -1,45 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - }, - "learning_rate_decay": { - "type": "piecewise_constant", - "kwargs": { - "boundaries": [100000], - "values": [1e-3, 1e-5], - }, - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index dc924bc8..15e1991b 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -36,7 +36,8 @@ "The dataset config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. " "Also used as the directory name of run.") -flags.DEFINE_integer("pretrain_epochs", 8, "Number of pretraining epochs.") +flags.DEFINE_integer("pretrain_epochs", 10000, "Number of pretraining epochs.") +flags.DEFINE_string("stage", "xe0", "stage.") FLAGS = flags.FLAGS @@ -45,6 +46,7 @@ config_data = importlib.import_module(FLAGS.config_data) expr_name = FLAGS.expr_name pretrain_epochs = FLAGS.pretrain_epochs +stage = FLAGS.stage mask_patterns = config_train.mask_patterns @@ -83,9 +85,13 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - train_xe_op = tx.core.get_train_op( + train_xe0_op = tx.core.get_train_op( loss_xe, - hparams=config_train.train_xe) + hparams=config_train.train_xe0) + + train_xe1_op = tx.core.get_train_op( + loss_xe, + hparams=config_train.train_xe1) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -122,7 +128,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe_op, train_debleu_op, tm_helper, bs_outputs + return train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, bs_outputs def main(): @@ -138,16 +144,21 @@ def main(): global_step = tf.train.create_global_step() - train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ + train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] - summary_xe_op = tf.summary.merge( + summary_xe0_op = tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope=get_scope_by_name(train_xe0_op)), + name='summary_xe') + summary_xe1_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe_op)), + scope=get_scope_by_name(train_xe1_op)), name='summary_xe') summary_debleu_op = 
tf.summary.merge( tf.get_collection( @@ -268,9 +279,8 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch = 0 while epoch < config_train.max_epochs: - pretraining = epoch < pretrain_epochs print('epoch #{}{}:'.format( - epoch, ' (pretraining)' if pretraining else '')) + epoch, ' ({})'.format(stage))) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) test_bleu = _eval_epoch(sess, summary_writer, 'test', trigger) @@ -285,7 +295,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): saved_path = saver.save( sess, ckpt_best, global_step=step) - if not pretraining: + if stage == 'debleu': with open('{}.trigger'.format(saved_path), 'w') as \ pickle_file: trigger.save_to_pickle(pickle_file) @@ -293,16 +303,17 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('saved to {}'.format(saved_path)) train_op, summary_op, trigger_ = { - True: (train_xe_op, summary_xe_op, None), - False: (train_debleu_op, summary_debleu_op, trigger) - }[pretraining] + 'xe0': (train_xe0_op, summary_xe0_op, None), + 'xe1': (train_xe1_op, summary_xe1_op, None), + 'debleu': (train_debleu_op, summary_debleu_op, trigger) + }[stage] _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) epoch += 1 step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) - if not pretraining: + if stage == 'debleu': with open('{}.trigger'.format(saved_path), 'w') as pickle_file: trigger.save_to_pickle(pickle_file) From 923ea8cab258ff43555e468358bc2734b186649b Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Fri, 19 Oct 2018 02:38:32 +0000 Subject: [PATCH 43/65] add config_train --- .../config_train.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train.py diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py new file mode 100644 index 00000000..1d55dd40 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train.py @@ -0,0 +1,57 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] +threshold_steps = 10000 +minimum_interval_steps = 10000 + +train_xe0 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-3 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE_0" +} + +train_xe1 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE_1" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
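The `"name"` fields in these training configs give each train op its own variable scope, which is what the summary fix in PATCH 41 relies on: summaries created while building an op can later be fetched back by scope. A minimal sketch of that pattern, mirroring the `get_scope_by_name` helper added above (plain TF 1.x; `train_op` is any op built under such a named scope):

```
import tensorflow as tf

def scope_of(tensor):
    # Keep everything up to the last '/', e.g. "XE_0/Adam" -> "XE_0/".
    return tensor.name[: tensor.name.rfind('/') + 1]

def merged_summaries_for(train_op, name):
    # Merge only the summaries recorded under this train op's scope,
    # so each training stage gets its own summary op.
    return tf.summary.merge(
        tf.get_collection(tf.GraphKeys.SUMMARIES, scope=scope_of(train_op)),
        name=name)
```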
+ }, + }, + "name": "DEBLEU" +} From 56b44c7cc7e7ccce000692e14922c4127daadfaa Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Fri, 19 Oct 2018 22:41:44 -0400 Subject: [PATCH 44/65] modify 2-layer encoder to 1-layer --- examples/differentiable_expected_bleu/config_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 3b7a8da7..55afef49 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -13,7 +13,6 @@ 'kwargs': { 'num_units': num_units }, - 'num_layers': 2 }, 'output_layer_fw': { 'dropout_rate': 0 From c6991c8027778b214c65df8dadcb3ac0c1ea3b9b Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sat, 20 Oct 2018 01:25:57 -0400 Subject: [PATCH 45/65] change configs to bowen's --- .../config_data_iwslt14_de-en.py | 2 ++ examples/differentiable_expected_bleu/config_model.py | 7 ++----- examples/differentiable_expected_bleu/config_train.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index fb03a8bb..cf0c645f 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -17,6 +17,7 @@ 'max_seq_length': 50 }, } + val = { 'batch_size': batch_size, 'shuffle': False, @@ -29,6 +30,7 @@ 'vocab_file': target_vocab_file, }, } + test = { 'batch_size': batch_size, 'shuffle': False, diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 55afef49..125b1fc6 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -1,8 +1,8 @@ # Attentional Seq2seq model. # Hyperparameters not specified here will take the default values. -num_units = 1000 -embedding_dim = 500 +num_units = 256 +embedding_dim = 256 embedder = { 'dim': embedding_dim @@ -14,9 +14,6 @@ 'num_units': num_units }, }, - 'output_layer_fw': { - 'dropout_rate': 0 - } } decoder = { diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 1d55dd40..2ecf2210 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,7 +1,7 @@ max_epochs = 1000 steps_per_eval = 500 tau = 1. 
-infer_beam_width = 1 +infer_beam_width = 10 infer_max_decoding_length = 50 mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] From 7e89acf59c04569e0a0f435f5e940f3a8168cc7f Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 20 Oct 2018 17:48:18 -0400 Subject: [PATCH 46/65] open trigger file in binary mode --- .../differentiable_expected_bleu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 15e1991b..7a4744d4 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -267,7 +267,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): trigger_path = '{}.trigger'.format(ckpt_path) if os.path.exists(trigger_path): - with open(trigger_path, 'r') as pickle_file: + with open(trigger_path, 'rb') as pickle_file: trigger.restore_from_pickle(pickle_file) else: print('cannot find previous trigger state.') @@ -296,7 +296,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess, ckpt_best, global_step=step) if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'w') as \ + with open('{}.trigger'.format(saved_path), 'wb') as \ pickle_file: trigger.save_to_pickle(pickle_file) From de78471aa4badf0f0406251ebd40b80b126c1cf5 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 20 Oct 2018 18:18:41 -0400 Subject: [PATCH 47/65] add binary mode --- .../differentiable_expected_bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7a4744d4..7529afe8 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -314,7 +314,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): saved_path = saver.save(sess, ckpt_model, global_step=step) if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'w') as pickle_file: + with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) From b822f393e9dbeb215f8c3225c8fe8ca4ced5adde Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sun, 21 Oct 2018 22:37:13 -0400 Subject: [PATCH 48/65] use new datasets ; reinitialize optimizer when annealing ; modify config_train.py ; replace tf.Variable by tf.get_variable in TeacherMaskTrainingHelper --- .../config_train.py | 26 ++------- .../differentiable_expected_bleu.py | 58 ++++++++++++------- .../prepare_data.py | 2 +- texar/modules/decoders/rnn_decoder_helpers.py | 10 ++-- 4 files changed, 49 insertions(+), 47 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 2ecf2210..bb80f0ca 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,18 +1,18 @@ max_epochs = 1000 -steps_per_eval = 500 +steps_per_eval = int(1e9) tau = 1. 
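PATCH 48's "reinitialize optimizer when annealing" refers to resetting the optimizer's slot variables (Adam's moment accumulators) whenever the training phase or mask pattern changes. Because each train op lives under its own named scope, its variables can be gathered and re-initialized in isolation without touching the model weights. A minimal sketch of that idea (plain TF 1.x; helper name is illustrative, and it assumes the optimizer is created inside the train op's scope as in the configs above):

```
import tensorflow as tf

def reinitialize_optimizer_state(sess, train_op):
    # Adam's slot variables (m, v, beta-power accumulators) are created
    # under the same scope as the train op, so running their initializer
    # resets the optimizer state only.
    scope = train_op.name[: train_op.name.rfind('/') + 1]
    optimizer_vars = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
    sess.run(tf.variables_initializer(optimizer_vars))
```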
infer_beam_width = 10 infer_max_decoding_length = 50 mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 10000 +threshold_steps = 25000 minimum_interval_steps = 10000 -train_xe0 = { +train_xe = { "optimizer": { "type": "AdamOptimizer", "kwargs": { - "learning_rate": 1e-3 + "learning_rate": [1e-3, 1e-5] } }, "gradient_clip": { @@ -21,23 +21,7 @@ "clip_norm": 5. }, }, - "name": "XE_0" -} - -train_xe1 = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5 - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE_1" + "name": "XE" } train_debleu = { diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 7529afe8..90e26172 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,7 +29,7 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train_iwslt14_de-en", +flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model", "The model config.") flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", @@ -38,6 +38,8 @@ "Also used as the directory name of run.") flags.DEFINE_integer("pretrain_epochs", 10000, "Number of pretraining epochs.") flags.DEFINE_string("stage", "xe0", "stage.") +flags.DEFINE_boolean("reinitialize_optimizer", False, "Whether to reinitialize " + "optimizer state before training.") FLAGS = flags.FLAGS @@ -47,8 +49,13 @@ expr_name = FLAGS.expr_name pretrain_epochs = FLAGS.pretrain_epochs stage = FLAGS.stage +reinitialize_optimizer = FLAGS.reinitialize_optimizer mask_patterns = config_train.mask_patterns +if stage.startswith("xe"): + d = config_train.train_xe["optimizer"]["kwargs"] + d["learning_rate"] = d["learning_rate"][int(stage[2:])] + def get_scope_by_name(tensor): return tensor.name[: tensor.name.rfind('/') + 1] @@ -85,13 +92,9 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - train_xe0_op = tx.core.get_train_op( - loss_xe, - hparams=config_train.train_xe0) - - train_xe1_op = tx.core.get_train_op( + train_xe_op = tx.core.get_train_op( loss_xe, - hparams=config_train.train_xe1) + hparams=config_train.train_xe) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -128,7 +131,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, bs_outputs + return train_xe_op, train_debleu_op, tm_helper, bs_outputs def main(): @@ -144,21 +147,26 @@ def main(): global_step = tf.train.create_global_step() - train_xe0_op, train_xe1_op, train_debleu_op, tm_helper, infer_outputs = \ + train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ build_model(data_batch, train_data) + train_xe_op_initializer, train_debleu_op_initializer = [ + tf.variables_initializer( + tf.get_collection( + tf.GraphKeys.GLOBAL_VARIABLES, + scope=get_scope_by_name(train_op)), + name=name) + for train_op, name in [ + (train_xe_op, "train_xe_op_initializer"), + (train_debleu_op, "train_debleu_op_initializer")]] + summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] - summary_xe0_op = tf.summary.merge( + 
summary_xe_op = tf.summary.merge( tf.get_collection( tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe0_op)), - name='summary_xe') - summary_xe1_op = tf.summary.merge( - tf.get_collection( - tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe1_op)), + scope=get_scope_by_name(train_xe_op)), name='summary_xe') summary_debleu_op = tf.summary.merge( tf.get_collection( @@ -243,8 +251,12 @@ def _eval_epoch(sess, summary_writer, mode, trigger): best_val_bleu = -1 with tf.Session() as sess: - action = (tm_helper.assign_mask_pattern(sess, n_unmask, n_mask) - for n_unmask, n_mask in mask_patterns[1:]) + def action_of_mask(mask_pattern): + sess.run(train_debleu_op_initializer) + tm_helper.assign_mask_pattern(sess, *mask_pattern) + + action = (action_of_mask(mask_pattern) + for mask_pattern in mask_patterns[1:]) trigger = tx.utils.BestEverConvergenceTrigger( action, config_train.threshold_steps, @@ -265,6 +277,10 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('restoring from {} ...'.format(ckpt_path)) saver.restore(sess, ckpt_path) + if reinitialize_optimizer: + sess.run(train_xe_op_initializer) + sess.run(train_debleu_op_initializer) + trigger_path = '{}.trigger'.format(ckpt_path) if os.path.exists(trigger_path): with open(trigger_path, 'rb') as pickle_file: @@ -283,7 +299,7 @@ def _eval_epoch(sess, summary_writer, mode, trigger): epoch, ' ({})'.format(stage))) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) - test_bleu = _eval_epoch(sess, summary_writer, 'test', trigger) + test_bleu = _eval_epoch(sess, summary_writer, 'test', None) step = tf.train.global_step(sess, global_step) print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( epoch, step, val_bleu, test_bleu)) @@ -303,8 +319,8 @@ def _eval_epoch(sess, summary_writer, mode, trigger): print('saved to {}'.format(saved_path)) train_op, summary_op, trigger_ = { - 'xe0': (train_xe0_op, summary_xe0_op, None), - 'xe1': (train_xe1_op, summary_xe1_op, None), + 'xe0': (train_xe_op, summary_xe_op, None), + 'xe1': (train_xe_op, summary_xe_op, None), 'debleu': (train_debleu_op, summary_debleu_op, trigger) }[stage] _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) diff --git a/examples/differentiable_expected_bleu/prepare_data.py b/examples/differentiable_expected_bleu/prepare_data.py index a7557c09..8a19075b 100644 --- a/examples/differentiable_expected_bleu/prepare_data.py +++ b/examples/differentiable_expected_bleu/prepare_data.py @@ -32,7 +32,7 @@ def prepare_data(): if FLAGS.data == 'de-en': tx.data.maybe_download( urls='https://drive.google.com/file/d/' - '1Vuv3bed10qUxrpldHdYoiWLzPKa4pNXd/view?usp=sharing', + '1y4mUWXRS2KstgHopCS9koZ42ENOh6Yb9/view?usp=sharing', path='./', filenames='iwslt14_de-en.zip', extract=True) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index f8e7040e..dba5087c 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -410,10 +410,12 @@ def __init__(self, inputs, sequence_length, embedding, n_unmask, self._zero_next_inputs = tf.zeros_like( self._embedding_fn(self._zero_inputs)) - self._n_unmask = tf.Variable(n_unmask, name='n_unmask') - self._n_mask = tf.Variable(n_mask, name='n_mask') + self._n_unmask = tf.get_variable( + "n_unmask", initializer=n_unmask, trainable=False) + self._n_mask = tf.get_variable( + "n_mask", initializer=n_mask, trainable=False) self._n_cycle = tf.add( - self._n_unmask, self._n_mask, name='n_cycle') + 
self._n_unmask, self._n_mask, name="n_cycle") self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) self._assign_n_unmask = tf.assign( @@ -421,7 +423,7 @@ def __init__(self, inputs, sequence_length, embedding, n_unmask, self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) self._n_shift = tf.random_uniform( [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, - seed=self._seed, name='n_shift') + seed=self._seed, name="n_shift") @property def sample_ids_dtype(self): From 9d6e4bb1f5d948aff5577928c3f31bb91ad465c3 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sun, 21 Oct 2018 22:50:50 -0400 Subject: [PATCH 49/65] replace name_scope by variable_scope in TeacherMaskSoftmaxEmbeddingHelper --- texar/modules/decoders/rnn_decoder_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index dba5087c..3ff9d419 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -394,8 +394,8 @@ class TeacherMaskSoftmaxEmbeddingHelper(TFTrainingHelper): def __init__(self, inputs, sequence_length, embedding, n_unmask, n_mask, tau=1., time_major=False, seed=None, stop_gradient=False, name=None): - with tf.name_scope(name, "TeacherMaskSoftmaxEmbeddingHelper", - [embedding, tau, seed, stop_gradient]): + with tf.variable_scope(name, "TeacherMaskSoftmaxEmbeddingHelper", + [embedding, tau, seed, stop_gradient]): super(TeacherMaskSoftmaxEmbeddingHelper, self).__init__( inputs=inputs, sequence_length=sequence_length, From ed1f6f3ec218c77ec13ffd17a75009e5dedf6aad Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 22 Oct 2018 14:03:23 -0400 Subject: [PATCH 50/65] fix lr bug --- .../differentiable_expected_bleu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 90e26172..c4de2695 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -52,9 +52,11 @@ reinitialize_optimizer = FLAGS.reinitialize_optimizer mask_patterns = config_train.mask_patterns +d = config_train.train_xe["optimizer"]["kwargs"] if stage.startswith("xe"): - d = config_train.train_xe["optimizer"]["kwargs"] d["learning_rate"] = d["learning_rate"][int(stage[2:])] +else: + d["learning_rate"] = d["learning_rate"][-1] def get_scope_by_name(tensor): From 3cfc217598c0aa9cc8a107a792689f21616cfa1e Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Sun, 28 Oct 2018 23:35:58 -0400 Subject: [PATCH 51/65] reset model and configs to those in pytorch codes ; fix connector bug ; move best checkpoint into evaluation --- .../config_data_iwslt14_de-en.py | 2 +- .../config_data_iwslt14_en-fr.py | 2 +- .../config_model.py | 12 +++- .../config_train.py | 8 +-- .../differentiable_expected_bleu.py | 63 ++++++++++++------- 5 files changed, 58 insertions(+), 29 deletions(-) diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index cf0c645f..a3236629 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,7 +1,7 @@ source_vocab_file = 'data/iwslt14_de-en/vocab.de' 
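The `name_scope` to `variable_scope` change in PATCH 49 matters because `tf.get_variable`, now used for the mask counters, ignores `tf.name_scope`; only `tf.variable_scope` prefixes the variables it creates. A minimal illustration (hypothetical scope names):

```
import tensorflow as tf

with tf.name_scope("helper_ns"):
    a = tf.get_variable("n_unmask", initializer=1, trainable=False)

with tf.variable_scope("helper_vs"):
    b = tf.get_variable("n_mask", initializer=0, trainable=False)

print(a.name)  # "n_unmask:0"          -- name_scope was not applied
print(b.name)  # "helper_vs/n_mask:0"  -- variable_scope was applied
```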
target_vocab_file = 'data/iwslt14_de-en/vocab.en' -batch_size = 160 +batch_size = 80 train = { 'batch_size': batch_size, diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py index a81090e6..4c4482f7 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_en-fr.py @@ -1,7 +1,7 @@ source_vocab_file = 'data/iwslt14_en-fr/vocab.en' target_vocab_file = 'data/iwslt14_en-fr/vocab.fr' -batch_size = 160 +batch_size = 80 train = { 'batch_size': batch_size, diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model.py index 125b1fc6..16dba9b9 100644 --- a/examples/differentiable_expected_bleu/config_model.py +++ b/examples/differentiable_expected_bleu/config_model.py @@ -1,8 +1,8 @@ # Attentional Seq2seq model. # Hyperparameters not specified here will take the default values. -num_units = 256 -embedding_dim = 256 +num_units = 1000 +embedding_dim = 500 embedder = { 'dim': embedding_dim @@ -13,7 +13,15 @@ 'kwargs': { 'num_units': num_units }, + 'num_layers': 2 }, + 'output_layer_fw': { + 'dropout_rate': 0 + } +} + +connector = { + 'activation_fn': 'tanh' } decoder = { diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index bb80f0ca..19bfdfa7 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -1,11 +1,11 @@ max_epochs = 1000 -steps_per_eval = int(1e9) +steps_per_eval = 500 tau = 1. -infer_beam_width = 10 +infer_beam_width = 1 infer_max_decoding_length = 50 -mask_patterns = [(2, 2), (4, 2), (8, 2), (1, 0)] -threshold_steps = 25000 +mask_patterns = [(2, 2), (4, 2), (1, 0)] +threshold_steps = int(1e9) minimum_interval_steps = 10000 train_xe = { diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index c4de2695..053a53e7 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -72,7 +72,8 @@ def build_model(batch, train_data): encoder = tx.modules.BidirectionalRNNEncoder( hparams=config_model.encoder) - enc_outputs, _ = encoder(source_embedder(batch['source_text_ids'])) + enc_outputs, enc_final_state = encoder( + source_embedder(batch['source_text_ids'])) target_embedder = tx.modules.WordEmbedder( vocab_size=train_data.target_vocab.size, hparams=config_model.embedder) @@ -83,9 +84,23 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) + enc_final_state = tf.contrib.framework.nest.map_structure( + lambda *args: tf.concat(args, -1), *enc_final_state) + + if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): + connector = tx.modules.MLPTransformConnector( + decoder.state_size.h, hparams=config_model.connector) + dec_initial_h = connector(enc_final_state.h) + dec_initial_state = (dec_initial_h, enc_final_state.c) + else: + connector = tx.modules.MLPTransformConnector( + decoder.state_size, hparams=config_model.connector) + dec_initial_state = connector(enc_final_state) + # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( decoding_strategy='train_greedy', + initial_state=dec_initial_state, 
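PATCH 51's connector fix concerns turning the bidirectional encoder's two final LSTM states into an initial state for the decoder. One common way to do this, shown as a sketch rather than the example's exact code (which routes only the LSTM `h` component through an `MLPTransformConnector`), is to concatenate the forward/backward states feature-wise and project each component to the decoder cell's size:

```
import tensorflow as tf

def decoder_initial_state(enc_final_state, decoder_cell):
    # enc_final_state is a pair of LSTMStateTuples (forward, backward).
    fw, bw = enc_final_state
    merged = tf.contrib.framework.nest.map_structure(
        lambda *tensors: tf.concat(tensors, axis=-1), fw, bw)
    # Project every state component to the size the decoder cell expects.
    return tf.contrib.framework.nest.map_structure(
        lambda t, size: tf.layers.dense(t, size, activation=tf.tanh),
        merged, decoder_cell.state_size)
```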
inputs=target_embedder(batch['target_text_ids'][:, :-1]), sequence_length=batch['target_length']-1) @@ -109,7 +124,8 @@ def build_model(batch, train_data): tau=config_train.tau) tm_outputs, _, _ = decoder( - helper=tm_helper) + helper=tm_helper, + initial_state=dec_initial_state) loss_debleu = tx.losses.debleu( labels=batch['target_text_ids'][:, 1:], @@ -130,6 +146,7 @@ def build_model(batch, train_data): embedding=target_embedder, start_tokens=start_tokens, end_token=end_token, + initial_state=dec_initial_state, beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) @@ -178,6 +195,9 @@ def main(): saver = tf.train.Saver(max_to_keep=None) + global best_val_bleu + best_val_bleu = -1 + def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): print('in _train_epoch') @@ -243,15 +263,30 @@ def _eval_epoch(sess, summary_writer, mode, trigger): summary_writer.add_summary(summary, step) summary_writer.flush() - if trigger is not None: - triggered, _ = trigger(step, bleu) - if triggered: - print('triggered!') + if mode == 'val': + if trigger is not None: + triggered, _ = trigger(step, bleu) + if triggered: + print('triggered!') + + global best_val_bleu + if bleu > best_val_bleu: + best_val_bleu = bleu + print('update best val bleu: {}'.format(best_val_bleu)) + + saved_path = saver.save( + sess, ckpt_best, global_step=step) + + if stage == 'debleu': + with open('{}.trigger'.format(saved_path), 'wb') as \ + pickle_file: + trigger.save_to_pickle(pickle_file) + + print('saved to {}'.format(saved_path)) print('end _eval_epoch') return bleu - best_val_bleu = -1 with tf.Session() as sess: def action_of_mask(mask_pattern): sess.run(train_debleu_op_initializer) @@ -306,20 +341,6 @@ def action_of_mask(mask_pattern): print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( epoch, step, val_bleu, test_bleu)) - if val_bleu > best_val_bleu: - best_val_bleu = val_bleu - print('update best val bleu: {}'.format(best_val_bleu)) - - saved_path = saver.save( - sess, ckpt_best, global_step=step) - - if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'wb') as \ - pickle_file: - trigger.save_to_pickle(pickle_file) - - print('saved to {}'.format(saved_path)) - train_op, summary_op, trigger_ = { 'xe0': (train_xe_op, summary_xe_op, None), 'xe1': (train_xe_op, summary_xe_op, None), From 18da2c7fa739b4c127eed56e0105d2c4cfcec9c8 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Mon, 29 Oct 2018 15:33:48 -0400 Subject: [PATCH 52/65] anneal to bs160 4:2 mask ; reinitialize mask after restoring --- .../config_data_iwslt14_de-en_bs160.py | 45 +++++++++++++++++++ .../config_train_4_2.py | 41 +++++++++++++++++ .../differentiable_expected_bleu.py | 4 ++ 3 files changed, 90 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py create mode 100644 examples/differentiable_expected_bleu/config_train_4_2.py diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py new file mode 100644 index 00000000..cf0c645f --- /dev/null +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py @@ -0,0 +1,45 @@ +source_vocab_file = 'data/iwslt14_de-en/vocab.de' +target_vocab_file = 'data/iwslt14_de-en/vocab.en' + +batch_size = 160 + +train = { + 'batch_size': batch_size, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/train.de', + 'vocab_file': 
source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/train.en', + 'vocab_file': target_vocab_file, + 'max_seq_length': 50 + }, +} + +val = { + 'batch_size': batch_size, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/valid.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/valid.en', + 'vocab_file': target_vocab_file, + }, +} + +test = { + 'batch_size': batch_size, + 'shuffle': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/test.de', + 'vocab_file': source_vocab_file, + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/test.en', + 'vocab_file': target_vocab_file, + }, +} diff --git a/examples/differentiable_expected_bleu/config_train_4_2.py b/examples/differentiable_expected_bleu/config_train_4_2.py new file mode 100644 index 00000000..4c8cc276 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_4_2.py @@ -0,0 +1,41 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. +infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(4, 2), (1, 0)] +threshold_steps = int(1e9) +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": [1e-3, 1e-5] + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 053a53e7..c1e05851 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -179,6 +179,9 @@ def main(): (train_xe_op, "train_xe_op_initializer"), (train_debleu_op, "train_debleu_op_initializer")]] + tm_helper_initializer = tf.variables_initializer( + [tm_helper.n_unmask, tm_helper.n_mask], name="tm_helper_initializer") + summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] @@ -317,6 +320,7 @@ def action_of_mask(mask_pattern): if reinitialize_optimizer: sess.run(train_xe_op_initializer) sess.run(train_debleu_op_initializer) + sess.run(tm_helper_initializer) trigger_path = '{}.trigger'.format(ckpt_path) if os.path.exists(trigger_path): From 1ac619dfbbbde41eaadef8badff91c8193cb427f Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 29 Oct 2018 23:48:43 -0400 Subject: [PATCH 53/65] add lr1e6_1_0.py config --- .../config_train_lr1e6_1_0.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py diff --git a/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py b/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py new file mode 100644 index 00000000..faad17d7 --- /dev/null +++ b/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py @@ -0,0 +1,41 @@ +max_epochs = 1000 +steps_per_eval = 500 +tau = 1. 
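The `mask_patterns` entries such as `(4, 2)` parameterize the teacher mask. Assuming, as the helper's `n_unmask`/`n_mask`/`n_shift` variables suggest, that each cycle feeds `n_unmask` ground-truth tokens followed by `n_mask` model-predicted (soft) tokens, repeated with a random cyclic shift, the mask looks like this small NumPy sketch (illustrative only):

```
import numpy as np

def teacher_mask(length, n_unmask, n_mask, n_shift=0):
    # True  -> feed the ground-truth (teacher) token at this position
    # False -> feed the model's own soft prediction instead
    n_cycle = n_unmask + n_mask
    positions = (np.arange(length) + n_shift) % n_cycle
    return positions < n_unmask

print(teacher_mask(12, 4, 2))
# [ True  True  True  True False False  True  True  True  True False False]
```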
+infer_beam_width = 1 +infer_max_decoding_length = 50 + +mask_patterns = [(1, 0)] +threshold_steps = int(1e9) +minimum_interval_steps = 10000 + +train_xe = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": [1e-3, 1e-5] + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "XE" +} + +train_debleu = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-6, + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + }, + }, + "name": "DEBLEU" +} From c227c28e30705d3444ddada27c4ebb049cacdff4 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Thu, 1 Nov 2018 22:58:13 -0400 Subject: [PATCH 54/65] add more model configs --- ...{config_model.py => config_model_large.py} | 0 .../config_model_medium.py | 40 +++++++++++++++++++ .../differentiable_expected_bleu.py | 34 +++++++++------- 3 files changed, 60 insertions(+), 14 deletions(-) rename examples/differentiable_expected_bleu/{config_model.py => config_model_large.py} (100%) create mode 100644 examples/differentiable_expected_bleu/config_model_medium.py diff --git a/examples/differentiable_expected_bleu/config_model.py b/examples/differentiable_expected_bleu/config_model_large.py similarity index 100% rename from examples/differentiable_expected_bleu/config_model.py rename to examples/differentiable_expected_bleu/config_model_large.py diff --git a/examples/differentiable_expected_bleu/config_model_medium.py b/examples/differentiable_expected_bleu/config_model_medium.py new file mode 100644 index 00000000..7750a97c --- /dev/null +++ b/examples/differentiable_expected_bleu/config_model_medium.py @@ -0,0 +1,40 @@ +# Attentional Seq2seq model. +# Hyperparameters not specified here will take the default values. + +num_units = 256 +embedding_dim = 256 +dropout = 0.2 + +embedder = { + 'dim': embedding_dim +} + +encoder = { + 'rnn_cell_fw': { + 'kwargs': { + 'num_units': num_units + }, + 'dropout': { + 'input_keep_prob': 1. - dropout + } + } +} + +connector = None + +decoder = { + 'rnn_cell': { + 'kwargs': { + 'num_units': num_units + }, + 'dropout': { + 'input_keep_prob': 1. - dropout + } + }, + 'attention': { + 'kwargs': { + 'num_units': num_units, + }, + 'attention_layer_size': num_units + } +} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index c1e05851..17c5d55b 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,9 +29,8 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train", - "The training config.") -flags.DEFINE_string("config_model", "config_model", "The model config.") +flags.DEFINE_string("config_train", "config_train", "The training config.") +flags.DEFINE_string("config_model", "config_model_medium", "The model config.") flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. 
" @@ -84,18 +83,22 @@ def build_model(batch, train_data): vocab_size=train_data.target_vocab.size, hparams=config_model.decoder) - enc_final_state = tf.contrib.framework.nest.map_structure( - lambda *args: tf.concat(args, -1), *enc_final_state) + if config_model.connector is None: + dec_initial_state = None - if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): - connector = tx.modules.MLPTransformConnector( - decoder.state_size.h, hparams=config_model.connector) - dec_initial_h = connector(enc_final_state.h) - dec_initial_state = (dec_initial_h, enc_final_state.c) else: - connector = tx.modules.MLPTransformConnector( - decoder.state_size, hparams=config_model.connector) - dec_initial_state = connector(enc_final_state) + enc_final_state = tf.contrib.framework.nest.map_structure( + lambda *args: tf.concat(args, -1), *enc_final_state) + + if isinstance(decoder.cell, tf.nn.rnn_cell.LSTMCell): + connector = tx.modules.MLPTransformConnector( + decoder.state_size.h, hparams=config_model.connector) + dec_initial_h = connector(enc_final_state.h) + dec_initial_state = (dec_initial_h, enc_final_state.c) + else: + connector = tx.modules.MLPTransformConnector( + decoder.state_size, hparams=config_model.connector) + dec_initial_state = connector(enc_final_state) # cross-entropy + teacher-forcing pretraining tf_outputs, _, _ = decoder( @@ -342,7 +345,7 @@ def action_of_mask(mask_pattern): val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) test_bleu = _eval_epoch(sess, summary_writer, 'test', None) step = tf.train.global_step(sess, global_step) - print('epoch: {}, step: {}, val bleu: {}, test bleu: {}'.format( + print('epoch: {}, step: {}, val BLEU: {}, test BLEU: {}'.format( epoch, step, val_bleu, test_bleu)) train_op, summary_op, trigger_ = { @@ -362,6 +365,9 @@ def action_of_mask(mask_pattern): print('saved to {}'.format(saved_path)) + test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + print('test BLEU: {}'.format(test_bleu)) + if __name__ == '__main__': main() From 316e41c452bb7b1727167f8441c11f7c0c137b8f Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 3 Nov 2018 02:03:13 -0400 Subject: [PATCH 55/65] refine code ; now everything is automatical --- .../config_data_iwslt14_de-en.py | 24 +- .../config_data_iwslt14_de-en_bs160.py | 45 ---- .../config_train.py | 59 ++++- .../config_train_4_2.py | 41 ---- .../config_train_lr1e6_1_0.py | 41 ---- .../differentiable_expected_bleu.py | 219 +++++++++--------- texar/utils/triggers.py | 23 ++ 7 files changed, 203 insertions(+), 249 deletions(-) delete mode 100644 examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py delete mode 100644 examples/differentiable_expected_bleu/config_train_4_2.py delete mode 100644 examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py index a3236629..ae3979f5 100644 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py +++ b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en.py @@ -1,10 +1,23 @@ source_vocab_file = 'data/iwslt14_de-en/vocab.de' target_vocab_file = 'data/iwslt14_de-en/vocab.en' -batch_size = 80 +train_0 = { + 'batch_size': 80, + 'allow_smaller_final_batch': False, + 'source_dataset': { + "files": 'data/iwslt14_de-en/train.de', + 'vocab_file': source_vocab_file, + 'max_seq_length': 50 + }, + 'target_dataset': { + 'files': 'data/iwslt14_de-en/train.en', + 'vocab_file': target_vocab_file, 
+ 'max_seq_length': 50 + }, +} -train = { - 'batch_size': batch_size, +train_1 = { + 'batch_size': 160, 'allow_smaller_final_batch': False, 'source_dataset': { "files": 'data/iwslt14_de-en/train.de', @@ -18,8 +31,9 @@ }, } + val = { - 'batch_size': batch_size, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/valid.de', @@ -32,7 +46,7 @@ } test = { - 'batch_size': batch_size, + 'batch_size': 80, 'shuffle': False, 'source_dataset': { "files": 'data/iwslt14_de-en/test.de', diff --git a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py b/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py deleted file mode 100644 index cf0c645f..00000000 --- a/examples/differentiable_expected_bleu/config_data_iwslt14_de-en_bs160.py +++ /dev/null @@ -1,45 +0,0 @@ -source_vocab_file = 'data/iwslt14_de-en/vocab.de' -target_vocab_file = 'data/iwslt14_de-en/vocab.en' - -batch_size = 160 - -train = { - 'batch_size': batch_size, - 'allow_smaller_final_batch': False, - 'source_dataset': { - "files": 'data/iwslt14_de-en/train.de', - 'vocab_file': source_vocab_file, - 'max_seq_length': 50 - }, - 'target_dataset': { - 'files': 'data/iwslt14_de-en/train.en', - 'vocab_file': target_vocab_file, - 'max_seq_length': 50 - }, -} - -val = { - 'batch_size': batch_size, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14_de-en/valid.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14_de-en/valid.en', - 'vocab_file': target_vocab_file, - }, -} - -test = { - 'batch_size': batch_size, - 'shuffle': False, - 'source_dataset': { - "files": 'data/iwslt14_de-en/test.de', - 'vocab_file': source_vocab_file, - }, - 'target_dataset': { - 'files': 'data/iwslt14_de-en/test.en', - 'vocab_file': target_vocab_file, - }, -} diff --git a/examples/differentiable_expected_bleu/config_train.py b/examples/differentiable_expected_bleu/config_train.py index 19bfdfa7..09d3464f 100644 --- a/examples/differentiable_expected_bleu/config_train.py +++ b/examples/differentiable_expected_bleu/config_train.py @@ -4,38 +4,77 @@ infer_beam_width = 1 infer_max_decoding_length = 50 -mask_patterns = [(2, 2), (4, 2), (1, 0)] -threshold_steps = int(1e9) +threshold_steps = 10000 minimum_interval_steps = 10000 +phases = [ + # (config_data, config_train, mask_pattern) + ("train_0", "xe_0", None), + ("train_0", "xe_1", None), + ("train_0", "debleu_0", (2, 2)), + ("train_1", "debleu_0", (4, 2)), + ("train_1", "debleu_1", (1, 0)), +] -train_xe = { +train_xe_0 = { "optimizer": { "type": "AdamOptimizer", "kwargs": { - "learning_rate": [1e-3, 1e-5] + "learning_rate": 1e-3 } }, "gradient_clip": { "type": "clip_by_global_norm", "kwargs": { "clip_norm": 5. - }, + } + }, + "name": "XE_0" +} + +train_xe_1 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-5 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. + } }, - "name": "XE" + "name": "XE_1" } -train_debleu = { +train_debleu_0 = { "optimizer": { "type": "AdamOptimizer", "kwargs": { - "learning_rate": 1e-5, + "learning_rate": 1e-5 } }, "gradient_clip": { "type": "clip_by_global_norm", "kwargs": { "clip_norm": 5. - }, + } + }, + "name": "DEBLEU_0" +} + +train_debleu_1 = { + "optimizer": { + "type": "AdamOptimizer", + "kwargs": { + "learning_rate": 1e-6 + } + }, + "gradient_clip": { + "type": "clip_by_global_norm", + "kwargs": { + "clip_norm": 5. 
+ } }, - "name": "DEBLEU" + "name": "DEBLEU_1" } diff --git a/examples/differentiable_expected_bleu/config_train_4_2.py b/examples/differentiable_expected_bleu/config_train_4_2.py deleted file mode 100644 index 4c8cc276..00000000 --- a/examples/differentiable_expected_bleu/config_train_4_2.py +++ /dev/null @@ -1,41 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(4, 2), (1, 0)] -threshold_steps = int(1e9) -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": [1e-3, 1e-5] - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-5, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py b/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py deleted file mode 100644 index faad17d7..00000000 --- a/examples/differentiable_expected_bleu/config_train_lr1e6_1_0.py +++ /dev/null @@ -1,41 +0,0 @@ -max_epochs = 1000 -steps_per_eval = 500 -tau = 1. -infer_beam_width = 1 -infer_max_decoding_length = 50 - -mask_patterns = [(1, 0)] -threshold_steps = int(1e9) -minimum_interval_steps = 10000 - -train_xe = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": [1e-3, 1e-5] - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "XE" -} - -train_debleu = { - "optimizer": { - "type": "AdamOptimizer", - "kwargs": { - "learning_rate": 1e-6, - } - }, - "gradient_clip": { - "type": "clip_by_global_norm", - "kwargs": { - "clip_norm": 5. - }, - }, - "name": "DEBLEU" -} diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 17c5d55b..a45e2b83 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -29,34 +29,26 @@ flags = tf.flags -flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("config_model", "config_model_medium", "The model config.") flags.DEFINE_string("config_data", "config_data_iwslt14_de-en", "The dataset config.") +flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. 
" - "Also used as the directory name of run.") -flags.DEFINE_integer("pretrain_epochs", 10000, "Number of pretraining epochs.") -flags.DEFINE_string("stage", "xe0", "stage.") -flags.DEFINE_boolean("reinitialize_optimizer", False, "Whether to reinitialize " - "optimizer state before training.") + "Used as the directory name of run.") +flags.DEFINE_boolean("reinitialize", True, "Whether to reinitialize the state " + "of the optimizers before training and after triggering.") FLAGS = flags.FLAGS -config_train = importlib.import_module(FLAGS.config_train) config_model = importlib.import_module(FLAGS.config_model) config_data = importlib.import_module(FLAGS.config_data) +config_train = importlib.import_module(FLAGS.config_train) expr_name = FLAGS.expr_name -pretrain_epochs = FLAGS.pretrain_epochs -stage = FLAGS.stage -reinitialize_optimizer = FLAGS.reinitialize_optimizer -mask_patterns = config_train.mask_patterns - -d = config_train.train_xe["optimizer"]["kwargs"] -if stage.startswith("xe"): - d["learning_rate"] = d["learning_rate"][int(stage[2:])] -else: - d["learning_rate"] = d["learning_rate"][-1] +reinitialize = FLAGS.reinitialize +phases = config_train.phases +xe_names = ('xe_0', 'xe_1') +debleu_names = ('debleu_0', 'debleu_1') def get_scope_by_name(tensor): return tensor.name[: tensor.name.rfind('/') + 1] @@ -65,6 +57,8 @@ def get_scope_by_name(tensor): def build_model(batch, train_data): """Assembles the seq2seq model. """ + train_ops = {} + source_embedder = tx.modules.WordEmbedder( vocab_size=train_data.source_vocab.size, hparams=config_model.embedder) @@ -112,9 +106,12 @@ def build_model(batch, train_data): logits=tf_outputs.logits, sequence_length=batch['target_length']-1) - train_xe_op = tx.core.get_train_op( + train_ops[xe_names[0]] = tx.core.get_train_op( + loss_xe, + hparams=config_train.train_xe_0) + train_ops[xe_names[1]] = tx.core.get_train_op( loss_xe, - hparams=config_train.train_xe) + hparams=config_train.train_xe_1) # teacher mask + DEBLEU fine-tuning tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( @@ -122,8 +119,8 @@ def build_model(batch, train_data): inputs=batch['target_text_ids'], sequence_length=batch['target_length']-1, embedding=target_embedder, - n_unmask=mask_patterns[0][0], - n_mask=mask_patterns[0][1], + n_unmask=1, + n_mask=0, tau=config_train.tau) tm_outputs, _, _ = decoder( @@ -135,9 +132,12 @@ def build_model(batch, train_data): probs=tm_outputs.sample_id, sequence_length=batch['target_length']-1) - train_debleu_op = tx.core.get_train_op( + train_ops[debleu_names[0]] = tx.core.get_train_op( loss_debleu, - hparams=config_train.train_debleu) + hparams=config_train.train_debleu_0) + train_ops[debleu_names[1]] = tx.core.get_train_op( + loss_debleu, + hparams=config_train.train_debleu_1) # inference: beam search decoding start_tokens = tf.ones_like(batch['target_length']) * \ @@ -153,64 +153,78 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_xe_op, train_debleu_op, tm_helper, bs_outputs + return train_ops, tm_helper, bs_outputs def main(): """Entrypoint. 
""" - train_data = tx.data.PairedTextData(hparams=config_data.train) + train_0_data = tx.data.PairedTextData(hparams=config_data.train_0) + train_1_data = tx.data.PairedTextData(hparams=config_data.train_1) val_data = tx.data.PairedTextData(hparams=config_data.val) test_data = tx.data.PairedTextData(hparams=config_data.test) data_iterator = tx.data.FeedableDataIterator( - {'train': train_data, 'val': val_data, 'test': test_data}) - + {'train_0': train_0_data, 'train_1': train_1_data, + 'val': val_data, 'test': test_data}) data_batch = data_iterator.get_next() global_step = tf.train.create_global_step() - train_xe_op, train_debleu_op, tm_helper, infer_outputs = \ - build_model(data_batch, train_data) + train_ops, tm_helper, infer_outputs = build_model(data_batch, train_0_data) + + def get_train_op_scope(name): + return get_scope_by_name(train_ops[name]) - train_xe_op_initializer, train_debleu_op_initializer = [ - tf.variables_initializer( + train_op_initializers = { + name: tf.variables_initializer( tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, - scope=get_scope_by_name(train_op)), - name=name) - for train_op, name in [ - (train_xe_op, "train_xe_op_initializer"), - (train_debleu_op, "train_debleu_op_initializer")]] - + scope=get_train_op_scope(name)), + name='train_{}_op_initializer'.format(name)) + for name in (xe_names + debleu_names)} tm_helper_initializer = tf.variables_initializer( [tm_helper.n_unmask, tm_helper.n_mask], name="tm_helper_initializer") summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), tf.summary.scalar('tm/n_mask', tm_helper.n_mask)] - summary_xe_op = tf.summary.merge( - tf.get_collection( - tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_xe_op)), - name='summary_xe') - summary_debleu_op = tf.summary.merge( - tf.get_collection( - tf.GraphKeys.SUMMARIES, - scope=get_scope_by_name(train_debleu_op)) + summary_tm, - name='summary_debleu') + summary_ops = { + name: tf.summary.merge( + tf.get_collection( + tf.GraphKeys.SUMMARIES, + scope=get_train_op_scope(name)) + + (summary_tm if name in debleu_names else []), + name='summary_{}'.format(name)) + for name in (xe_names + debleu_names)} saver = tf.train.Saver(max_to_keep=None) - global best_val_bleu - best_val_bleu = -1 + def _restore_from(directory, restore_trigger): + if os.path.exists(directory): + ckpt_path = tf.train.latest_checkpoint(directory) + print('restoring from {} ...'.format(ckpt_path)) + saver.restore(sess, ckpt_path) + + if restore_trigger: + trigger_path = '{}.trigger'.format(ckpt_path) + if os.path.exists(trigger_path): + with open(trigger_path, 'rb') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous trigger state.') + + print('done.') - def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): + else: + print('cannot find checkpoint directory {}'.format(directory)) + + def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): print('in _train_epoch') - data_iterator.restart_dataset(sess, 'train') + data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, - data_iterator.handle: data_iterator.get_handle(sess, 'train') + data_iterator.handle: data_iterator.get_handle(sess, mode) } while True: @@ -221,7 +235,10 @@ def _train_epoch(sess, summary_writer, train_op, summary_op, trigger): summary_writer.add_summary(summary, step) if step % config_train.steps_per_eval == 0: + global triggered _eval_epoch(sess, summary_writer, 'val', trigger) + if triggered: + break 
except tf.errors.OutOfRangeError: break @@ -271,41 +288,25 @@ def _eval_epoch(sess, summary_writer, mode, trigger): if mode == 'val': if trigger is not None: - triggered, _ = trigger(step, bleu) - if triggered: - print('triggered!') - - global best_val_bleu - if bleu > best_val_bleu: - best_val_bleu = bleu - print('update best val bleu: {}'.format(best_val_bleu)) - - saved_path = saver.save( - sess, ckpt_best, global_step=step) + if (trigger.best_ever_score is not None and + bleu > trigger.best_ever_score): + print('update best val bleu: {}'.format(bleu)) - if stage == 'debleu': + saved_path = saver.save(sess, ckpt_best, global_step=step) with open('{}.trigger'.format(saved_path), 'wb') as \ pickle_file: trigger.save_to_pickle(pickle_file) + print('saved to {}'.format(saved_path)) - print('saved to {}'.format(saved_path)) + global triggered + triggered, _ = trigger(step, bleu) + if triggered: + print('triggered!') print('end _eval_epoch') return bleu with tf.Session() as sess: - def action_of_mask(mask_pattern): - sess.run(train_debleu_op_initializer) - tm_helper.assign_mask_pattern(sess, *mask_pattern) - - action = (action_of_mask(mask_pattern) - for mask_pattern in mask_patterns[1:]) - trigger = tx.utils.BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) - sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) @@ -315,58 +316,62 @@ def action_of_mask(mask_pattern): ckpt_model = os.path.join(dir_model, 'model.ckpt') ckpt_best = os.path.join(dir_best, 'model.ckpt') - if os.path.exists(dir_model): - ckpt_path = tf.train.latest_checkpoint(dir_model) - print('restoring from {} ...'.format(ckpt_path)) - saver.restore(sess, ckpt_path) - - if reinitialize_optimizer: - sess.run(train_xe_op_initializer) - sess.run(train_debleu_op_initializer) - sess.run(tm_helper_initializer) - - trigger_path = '{}.trigger'.format(ckpt_path) - if os.path.exists(trigger_path): - with open(trigger_path, 'rb') as pickle_file: - trigger.restore_from_pickle(pickle_file) - else: - print('cannot find previous trigger state.') + def action_before_phase(phase): + global train_data_name, train_op_name, mask_pattern,\ + train_op, summary_op + train_data_name, train_op_name, mask_pattern = phase + train_op = train_ops[train_op_name] + summary_op = summary_ops[train_op_name] + if reinitialize: + sess.run(train_op_initializers[train_op_name]) + if mask_pattern is not None: + tm_helper.assign_mask_pattern(sess, *mask_pattern) + + action = (action_before_phase(phase) for phase in phases) + next(action) + trigger = tx.utils.BestEverConvergenceTrigger( + action, + config_train.threshold_steps, + config_train.minimum_interval_steps, + default=None) - print('done.') + _restore_from(dir_model, restore_trigger=True) summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) epoch = 0 while epoch < config_train.max_epochs: - print('epoch #{}{}:'.format( - epoch, ' ({})'.format(stage))) + print('epoch #{} {}:'.format( + epoch, (train_data_name, train_op_name, mask_pattern))) val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + if triggered: + _restore_from(dir_best, restore_trigger=False) + test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + step = tf.train.global_step(sess, global_step) + print('epoch: {}, step: {}, val BLEU: {}, test BLEU: {}'.format( epoch, step, val_bleu, test_bleu)) - train_op, summary_op, trigger_ = { - 'xe0': 
(train_xe_op, summary_xe_op, None), - 'xe1': (train_xe_op, summary_xe_op, None), - 'debleu': (train_debleu_op, summary_debleu_op, trigger) - }[stage] - _train_epoch(sess, summary_writer, train_op, summary_op, trigger_) + _train_epoch(sess, summary_writer, train_data_name, + train_op, summary_op, trigger) + if triggered: + _restore_from(dir_best, restore_trigger=False) + epoch += 1 step = tf.train.global_step(sess, global_step) saved_path = saver.save(sess, ckpt_model, global_step=step) - - if stage == 'debleu': - with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: - trigger.save_to_pickle(pickle_file) + with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: + trigger.save_to_pickle(pickle_file) print('saved to {}'.format(saved_path)) test_bleu = _eval_epoch(sess, summary_writer, 'test', None) - print('test BLEU: {}'.format(test_bleu)) + print('epoch: {}, test BLEU: {}'.format(epoch, test_bleu)) if __name__ == '__main__': diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index ce3c7183..ba1ab0b5 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -209,6 +209,11 @@ def __call__(self, step, score): step (int): Current training step to update. The training step must be updated in ascending order. score (float): Current value of the maintained metric. + + Returns: + A tuple `(triggered, retval)`, where boolean `triggered` denotes + whether triggered this time and `retval` is the return value of the + action performed this time. """ return super(BestEverConvergenceTrigger, self).__call__(step, score) @@ -217,6 +222,24 @@ def _state_names(self): return super(BestEverConvergenceTrigger, self)._state_names + [ '_last_triggered_step', '_best_ever_step', '_best_ever_score'] + @property + def last_triggered_step(self): + """The step at which the Trigger last triggered. + """ + return self._last_triggered_step + + @property + def best_ever_step(self): + """The step at which the best-ever score is reached. + """ + return self._best_ever_step + + @property + def best_ever_score(self): + """The best-ever score. 
+ """ + return self._best_ever_score + class MovingAverageConvergenceTrigger(Trigger): From 0f157e89abcd334c79e8952e378966e743118924 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sat, 3 Nov 2018 23:12:03 -0400 Subject: [PATCH 56/65] make mask pattern Tensors and use placeholder --- .../differentiable_expected_bleu.py | 19 ++++++++++--------- texar/modules/decoders/rnn_decoder_helpers.py | 16 ++-------------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index a45e2b83..e39ea949 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -114,13 +114,15 @@ def build_model(batch, train_data): hparams=config_train.train_xe_1) # teacher mask + DEBLEU fine-tuning + n_unmask = tf.placeholder(tf.int32, shape=[], name="n_unmask") + n_mask = tf.placeholder(tf.int32, shape=[], name="n_mask") tm_helper = tx.modules.TeacherMaskSoftmaxEmbeddingHelper( # must not remove last token, since it may be used as mask inputs=batch['target_text_ids'], sequence_length=batch['target_length']-1, embedding=target_embedder, - n_unmask=1, - n_mask=0, + n_unmask=n_unmask, + n_mask=n_mask, tau=config_train.tau) tm_outputs, _, _ = decoder( @@ -153,7 +155,7 @@ def build_model(batch, train_data): beam_width=config_train.infer_beam_width, max_decoding_length=config_train.infer_max_decoding_length) - return train_ops, tm_helper, bs_outputs + return train_ops, tm_helper, (n_unmask, n_mask), bs_outputs def main(): @@ -170,7 +172,8 @@ def main(): global_step = tf.train.create_global_step() - train_ops, tm_helper, infer_outputs = build_model(data_batch, train_0_data) + train_ops, tm_helper, mask_pattern_, infer_outputs = build_model( + data_batch, train_0_data) def get_train_op_scope(name): return get_scope_by_name(train_ops[name]) @@ -182,8 +185,6 @@ def get_train_op_scope(name): scope=get_train_op_scope(name)), name='train_{}_op_initializer'.format(name)) for name in (xe_names + debleu_names)} - tm_helper_initializer = tf.variables_initializer( - [tm_helper.n_unmask, tm_helper.n_mask], name="tm_helper_initializer") summary_tm = [ tf.summary.scalar('tm/n_unmask', tm_helper.n_unmask), @@ -224,7 +225,9 @@ def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, - data_iterator.handle: data_iterator.get_handle(sess, mode) + data_iterator.handle: data_iterator.get_handle(sess, mode), + mask_pattern_[0]: mask_pattern[0], + mask_pattern_[1]: mask_pattern[1], } while True: @@ -324,8 +327,6 @@ def action_before_phase(phase): summary_op = summary_ops[train_op_name] if reinitialize: sess.run(train_op_initializers[train_op_name]) - if mask_pattern is not None: - tm_helper.assign_mask_pattern(sess, *mask_pattern) action = (action_before_phase(phase) for phase in phases) next(action) diff --git a/texar/modules/decoders/rnn_decoder_helpers.py b/texar/modules/decoders/rnn_decoder_helpers.py index 3ff9d419..4af4f874 100644 --- a/texar/modules/decoders/rnn_decoder_helpers.py +++ b/texar/modules/decoders/rnn_decoder_helpers.py @@ -410,17 +410,10 @@ def __init__(self, inputs, sequence_length, embedding, n_unmask, self._zero_next_inputs = tf.zeros_like( self._embedding_fn(self._zero_inputs)) - self._n_unmask = tf.get_variable( - "n_unmask", initializer=n_unmask, 
trainable=False) - self._n_mask = tf.get_variable( - "n_mask", initializer=n_mask, trainable=False) + self._n_unmask = n_unmask + self._n_mask = n_mask self._n_cycle = tf.add( self._n_unmask, self._n_mask, name="n_cycle") - self._new_n_unmask = tf.placeholder(shape=[], dtype=tf.int32) - self._new_n_mask = tf.placeholder(shape=[], dtype=tf.int32) - self._assign_n_unmask = tf.assign( - self._n_unmask, self._new_n_unmask) - self._assign_n_mask = tf.assign(self._n_mask, self._new_n_mask) self._n_shift = tf.random_uniform( [], maxval=self._n_cycle, dtype=self._n_cycle.dtype, seed=self._seed, name="n_shift") @@ -441,11 +434,6 @@ def n_unmask(self): def n_mask(self): return self._n_mask - def assign_mask_pattern(self, sess, n_unmask, n_mask): - sess.run([self._assign_n_unmask, self._assign_n_mask], - feed_dict={self._new_n_unmask: n_unmask, - self._new_n_mask: n_mask}) - def _is_masked(self, time): return (time + self._n_shift) % self._n_cycle < self._n_mask From c4c428897e2e587f86db1c59788cdc587b58c219 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Sun, 4 Nov 2018 15:29:01 -0500 Subject: [PATCH 57/65] reconstruct triggers ; modify code --- .../differentiable_expected_bleu.py | 132 ++++++++++-------- texar/utils/triggers.py | 118 ++++++++-------- 2 files changed, 138 insertions(+), 112 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index e39ea949..6c567fe3 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Attentional Seq2seq. +"""DEBLEU. 
""" from __future__ import absolute_import from __future__ import print_function @@ -50,6 +50,12 @@ xe_names = ('xe_0', 'xe_1') debleu_names = ('debleu_0', 'debleu_1') +dir_model = os.path.join(expr_name, 'ckpt') +dir_best = os.path.join(expr_name, 'ckpt-best') +ckpt_model = os.path.join(dir_model, 'model.ckpt') +ckpt_best = os.path.join(dir_best, 'model.ckpt') + + def get_scope_by_name(tensor): return tensor.name[: tensor.name.rfind('/') + 1] @@ -198,37 +204,60 @@ def get_train_op_scope(name): name='summary_{}'.format(name)) for name in (xe_names + debleu_names)} + global convergence_trigger + convergence_trigger = tx.utils.BestEverConvergenceTrigger( + None, + lambda state: state, + config_train.threshold_steps, + config_train.minimum_interval_steps) + saver = tf.train.Saver(max_to_keep=None) - def _restore_from(directory, restore_trigger): + def _save_to(directory, step): + print('saving to {} ...'.format(directory)) + saved_path = saver.save(sess, directory, global_step=step) + + for trigger_name in ['convergence_trigger', 'annealing_trigger']: + trigger = globals()[trigger_name] + trigger_path = '{}.{}'.format(saved_path, trigger_name) + print('saving {} ...'.format(trigger_name)) + with open(trigger_path, 'wb') as pickle_file: + trigger.save_to_pickle(pickle_file) + + print('saved to {}'.format(saved_path)) + + def _restore_from(directory, restore_trigger_names): if os.path.exists(directory): ckpt_path = tf.train.latest_checkpoint(directory) print('restoring from {} ...'.format(ckpt_path)) saver.restore(sess, ckpt_path) - if restore_trigger: - trigger_path = '{}.trigger'.format(ckpt_path) + for trigger_name in restore_trigger_names: + trigger = globals()[trigger_name] + trigger_path = '{}.{}'.format(ckpt_path, trigger_name) if os.path.exists(trigger_path): + print('restoring {} ...'.format(trigger_name)) with open(trigger_path, 'rb') as pickle_file: trigger.restore_from_pickle(pickle_file) else: - print('cannot find previous trigger state.') + print('cannot find previous {} state.'.format(trigger_name)) print('done.') else: print('cannot find checkpoint directory {}'.format(directory)) - def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): + def _train_epoch(sess, summary_writer, mode, train_op, summary_op): print('in _train_epoch') data_iterator.restart_dataset(sess, mode) feed_dict = { tx.global_mode(): tf.estimator.ModeKeys.TRAIN, data_iterator.handle: data_iterator.get_handle(sess, mode), - mask_pattern_[0]: mask_pattern[0], - mask_pattern_[1]: mask_pattern[1], } + if mask_pattern is not None: + feed_dict.update( + {mask_pattern_[_]: mask_pattern[_] for _ in range(2)}) while True: try: @@ -239,7 +268,7 @@ def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): if step % config_train.steps_per_eval == 0: global triggered - _eval_epoch(sess, summary_writer, 'val', trigger) + _eval_epoch(sess, summary_writer, 'val') if triggered: break @@ -248,7 +277,7 @@ def _train_epoch(sess, summary_writer, mode, train_op, summary_op, trigger): print('end _train_epoch') - def _eval_epoch(sess, summary_writer, mode, trigger): + def _eval_epoch(sess, summary_writer, mode): print('in _eval_epoch with mode {}'.format(mode)) data_iterator.restart_dataset(sess, mode) @@ -290,21 +319,16 @@ def _eval_epoch(sess, summary_writer, mode, trigger): summary_writer.flush() if mode == 'val': - if trigger is not None: - if (trigger.best_ever_score is not None and - bleu > trigger.best_ever_score): - print('update best val bleu: {}'.format(bleu)) - - saved_path = 
saver.save(sess, ckpt_best, global_step=step) - with open('{}.trigger'.format(saved_path), 'wb') as \ - pickle_file: - trigger.save_to_pickle(pickle_file) - print('saved to {}'.format(saved_path)) - - global triggered - triggered, _ = trigger(step, bleu) - if triggered: - print('triggered!') + global triggered + triggered = convergence_trigger(step, bleu) + if triggered: + print('triggered!') + + if convergence_trigger.best_ever_step == step: + print('updated best val bleu: {}'.format( + convergence_trigger.best_ever_score)) + + _save_to(ckpt_best, step) print('end _eval_epoch') return bleu @@ -314,43 +338,42 @@ def _eval_epoch(sess, summary_writer, mode, trigger): sess.run(tf.local_variables_initializer()) sess.run(tf.tables_initializer()) - dir_model = os.path.join(expr_name, 'ckpt') - dir_best = os.path.join(expr_name, 'ckpt-best') - ckpt_model = os.path.join(dir_model, 'model.ckpt') - ckpt_best = os.path.join(dir_best, 'model.ckpt') - - def action_before_phase(phase): - global train_data_name, train_op_name, mask_pattern,\ - train_op, summary_op - train_data_name, train_op_name, mask_pattern = phase - train_op = train_ops[train_op_name] - summary_op = summary_ops[train_op_name] + def action(i): + if i >= len(phases): + return i + i += 1 + train_data_name, train_op_name, mask_pattern = phases[i] if reinitialize: sess.run(train_op_initializers[train_op_name]) + return i - action = (action_before_phase(phase) for phase in phases) - next(action) - trigger = tx.utils.BestEverConvergenceTrigger( - action, - config_train.threshold_steps, - config_train.minimum_interval_steps, - default=None) + global annealing_trigger + annealing_trigger = tx.utils.Trigger(0, action) - _restore_from(dir_model, restore_trigger=True) + def _restore_and_anneal(): + _restore_from(dir_best, ['convergence_trigger']) + annealing_trigger.trigger() + + _restore_from(dir_model, ['convergence_trigger', 'annealing_trigger']) summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) epoch = 0 while epoch < config_train.max_epochs: + train_data_name, train_op_name, mask_pattern = phases[ + annealing_trigger.user_state] + train_op = train_ops[train_op_name] + summary_op = summary_ops[train_op_name] + print('epoch #{} {}:'.format( epoch, (train_data_name, train_op_name, mask_pattern))) - val_bleu = _eval_epoch(sess, summary_writer, 'val', trigger) + val_bleu = _eval_epoch(sess, summary_writer, 'val') + test_bleu = _eval_epoch(sess, summary_writer, 'test') if triggered: - _restore_from(dir_best, restore_trigger=False) - - test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + _restore_and_anneal() + continue step = tf.train.global_step(sess, global_step) @@ -358,20 +381,17 @@ def action_before_phase(phase): epoch, step, val_bleu, test_bleu)) _train_epoch(sess, summary_writer, train_data_name, - train_op, summary_op, trigger) + train_op, summary_op) if triggered: - _restore_from(dir_best, restore_trigger=False) + _restore_and_anneal() + continue epoch += 1 step = tf.train.global_step(sess, global_step) - saved_path = saver.save(sess, ckpt_model, global_step=step) - with open('{}.trigger'.format(saved_path), 'wb') as pickle_file: - trigger.save_to_pickle(pickle_file) - - print('saved to {}'.format(saved_path)) + _save_to(ckpt_model, step) - test_bleu = _eval_epoch(sess, summary_writer, 'test', None) + test_bleu = _eval_epoch(sess, summary_writer, 'test') print('epoch: {}, test BLEU: {}'.format(epoch, test_bleu)) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index 
ba1ab0b5..a4128ba2 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Attentional Seq2seq. +"""Triggers. """ from __future__ import absolute_import from __future__ import print_function @@ -33,48 +33,52 @@ ] -DEFAULT_ACTION = object() - - class Trigger(object): - """This is the base class of all triggers. A trigger can do some action when - certain condition is met. Specifically, the user calls the trigger - periodically. Every time the trigger is called, it will send all arguments - to :meth:`_predicate`, which returns a boolean value indicates whether the - condition is met. Once the condition is met, the trigger will then call - `next(action)` to do next action and obtain the returned value. + """This is the base class of all triggers. A trigger maintains some + user-defined :attr:`user_state` and does some :attr:`action` when certain + condition is met. Specifically, the user calls the trigger periodically. + Every time the trigger is called, it will send all arguments to + :meth:`_predicate`, which returns a boolean value indicates whether the + condition is met. Once the condition is met, the trigger will then execute + `user_state = action(user_state)` to update the :attr:`user_state`. + :attr:`user_state` should completely define the current state of the + trigger, and, therefore, enables saving and restoring :attr:`user_state`. + It is the user's responsibility to keep :attr:`action` away from any + possible corruption of restored state. Args: - action (iterable): An iterable which iteratively does the action and - possibly returns a value. - default (optional): The value returned after :attr:`action` exhausted. - If not provided, the trigger will do nothing when `StopIteration` - occurs. + initial_user_state: A (any kind of picklable) object representing the + initial :attr:`user_state`. + action (function): A function which is called to update + :attr:`user_state` every time the trigger is triggered. See above + for detailed explanation. + .. document private functions + .. automethod:: __call__ """ - def __init__(self, action, default=DEFAULT_ACTION): - self._action = iter(action) - self._default = default - self._triggered_times = 0 + def __init__(self, initial_user_state, action): + self._user_state = initial_user_state + self._action = action def _predicate(self, *args, **kwargs): - """This function returns True when the condition is met and we should - do something. + """Returns True when the condition is met and we should do something. """ raise NotImplementedError - def _next_action(self): - return next(self._action) if self._default is DEFAULT_ACTION else \ - next(self._action, self._default) + def trigger(self): + """Executes `user_state = action(user_state)`. User can manually call + this method to trigger it. + """ + self._user_state = self._action(self._user_state) def __call__(self, *args, **kwargs): + """The trigger must be called to update the internal state and + automatically triggers when the condition is found met. 
+ """ pred = self._predicate(*args, **kwargs) if pred: - ret = self._next_action() - self._triggered_times += 1 - else: - ret = None - return pred, ret + self.trigger() + return pred def _make_state(self, names): return {name: getattr(self, name) for name in names} @@ -84,31 +88,31 @@ def _state_names(self): """Returns a list of names of attributes of the trigger object that can be saved and restored as trigger state. """ - return ['_triggered_times'] + return ['_user_state'] @property def state(self): """The current state which can be used to save and restore the trigger. - The state records how many times `next(action)` has been called. + The state is consisted of the internal state used to determine whether + the condition is met, and the user-defined :attr:`user_state`. """ return self._make_state(self._state_names) + @property + def user_state(self): + """The user-defined :attr:`user_state`. + """ + return self._user_state + def restore_from_state(self, state): - """Restore the trigger state from the previous stored state. - Note that this function will call `next(action)` for the exact times - that the :py:attr:`state` records how many times `next(action)` had - been called. The user should be aware of any possible side effect of - this behavior. + """Restore the trigger state from the previous saved state. Args: - state: The state previously obtained by :py:attr:`state`. + state: The state previously obtained by :attr:`state`. """ for name, value in state.items(): setattr(self, name, value) - for t in range(self._triggered_times): - self._next_action() - def save_to_pickle(self, file): """Write a pickled representation of the state of the trigger to the open file-like object :attr:`file`. @@ -123,10 +127,6 @@ def save_to_pickle(self, file): def restore_from_pickle(self, file): """Read a string from the open file-like object :attr:`file` and restore the trigger state from it. - Note that this function will call `next(action)` for the exact times - that the :py:attr:`state` records how many times `next(action)` had - been called. The user should be aware of any possible side effect of - this behavior. Args: file: The open file-like object from which we read. As described in @@ -138,12 +138,14 @@ def restore_from_pickle(self, file): class ScheduledStepsTrigger(Trigger): + """A trigger that triggers at designated steps. + """ - def __init__(self, action, steps, default=DEFAULT_ACTION): - """steps should be in increasing order. + def __init__(self, initial_user_state, action, steps): + """steps should be a list or tuple in increasing order. """ - super(ScheduledTrigger, self).__init__(action, default) - self._steps = iter(steps) + super(ScheduledTrigger, self).__init__(initial_user_state, action) + self._steps = steps self._advance_steps() def _advance_steps(self): @@ -165,8 +167,10 @@ class BestEverConvergenceTrigger(Trigger): triggers. Args: - action (iterable): An iterable which iteratively does the action and - possibly returns a value. + initial_user_state: A (any kind of picklable) object representing the + initial :attr:`user_state`. + action (function): A function which is called to update + :attr:`user_state` every time the trigger is triggered. threshold_steps (int): Number of steps it should trigger after the best value was last updated. minimum_interval_steps (int): Minimum number of steps between twice @@ -178,9 +182,10 @@ class BestEverConvergenceTrigger(Trigger): .. 
automethod:: __call__ """ - def __init__(self, action, threshold_steps, minimum_interval_steps, - default=DEFAULT_ACTION): - super(BestEverConvergenceTrigger, self).__init__(action, default) + def __init__(self, initial_user_state, action, threshold_steps, + minimum_interval_steps): + super(BestEverConvergenceTrigger, self).__init__( + initial_user_state, action) self._threshold_steps = threshold_steps self._minimum_interval_steps = minimum_interval_steps self._last_triggered_step = None @@ -243,9 +248,10 @@ def best_ever_score(self): class MovingAverageConvergenceTrigger(Trigger): - def __init__(self, action, n, threshold, minimum_interval_steps, - default=DEFAULT_ACTION): - super(MovingAverageConvergenceTrigger, self).__init__(action, default) + def __init__(self, initial_user_state, action, n, threshold, + minimum_interval_steps): + super(MovingAverageConvergenceTrigger, self).__init__( + initial_user_state, action) self._n = n self._threshold = threshold self._minimum_interval_steps = minimum_interval_steps From 2b1fe5a325b0028e5043f25ad3348fcc937e9513 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Mon, 5 Nov 2018 03:54:05 +0000 Subject: [PATCH 58/65] add test units for triggers --- texar/utils/triggers_test.py | 68 ++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 texar/utils/triggers_test.py diff --git a/texar/utils/triggers_test.py b/texar/utils/triggers_test.py new file mode 100644 index 00000000..a3f88a20 --- /dev/null +++ b/texar/utils/triggers_test.py @@ -0,0 +1,68 @@ +""" +Unit tests for triggers. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import random + +from texar.utils.triggers import Trigger, BestEverConvergenceTrigger + + +class TriggerTest(tf.test.TestCase): + """Tests :class:`~texar.utils.Trigger`. + """ + + def test(self): + trigger = Trigger(0, lambda x: x+1) + for step in range(100): + trigger.trigger() + self.assertEqual(trigger.user_state, step+1) + +class BestEverConvergenceTriggerTest(tf.test.TestCase): + """Tests :class:`~texar.utils.BestEverConvergenceTrigger`. 
+ """ + + def test(self): + for i in range(100): + n = random.randint(1, 100) + seq = list(range(n)) + random.shuffle(seq) + threshold_steps = random.randint(0, n // 2 + 1) + minimum_interval_steps = random.randint(0, n // 2 + 1) + trigger = BestEverConvergenceTrigger( + 0, lambda x: x+1, threshold_steps, minimum_interval_steps) + + best_ever_step, best_ever_score, last_triggered_step = -1, -1, None + + for step, score in enumerate(seq): + if score > best_ever_score: + best_ever_step = step + best_ever_score = score + + triggered_ = step - best_ever_step >= threshold_steps and \ + (last_triggered_step is None or + step - last_triggered_step >= minimum_interval_steps) + if triggered_: + last_triggered_step = step + + triggered = trigger(step, score) + + self.assertEqual(trigger.best_ever_step, best_ever_step) + self.assertEqual(trigger.best_ever_score, best_ever_score) + self.assertEqual(trigger.last_triggered_step, + last_triggered_step) + self.assertEqual(triggered, triggered_) + + trigger = BestEverConvergenceTrigger(0, lambda x: x+1, 0, 0) + for step in range(100): + trigger.trigger() + self.assertEqual(trigger.user_state, step+1) + + +if __name__ == "__main__": + tf.test.main() + From ec20a9eac5e01fc1582b5845dd60c473dedefdaf Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Mon, 5 Nov 2018 05:49:03 +0000 Subject: [PATCH 59/65] rewrite ScheduledStepsTrigger; correct and refine some docs TODO: 1. test ScheduledStepsTrigger; 2. test docs. --- texar/utils/triggers.py | 79 ++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index a4128ba2..f209f377 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -29,6 +29,7 @@ __all__ = [ "Trigger", + "ScheduledStepsTrigger", "BestEverConvergenceTrigger", ] @@ -58,6 +59,8 @@ class Trigger(object): def __init__(self, initial_user_state, action): self._user_state = initial_user_state + if not callable(action): + raise ValueError("Action {} is not callable".format(action)) self._action = action def _predicate(self, *args, **kwargs): @@ -74,6 +77,9 @@ def trigger(self): def __call__(self, *args, **kwargs): """The trigger must be called to update the internal state and automatically triggers when the condition is found met. + + Returns: + A boolean denotes whether triggered this time. """ pred = self._predicate(*args, **kwargs) if pred: @@ -138,7 +144,31 @@ def restore_from_pickle(self, file): class ScheduledStepsTrigger(Trigger): - """A trigger that triggers at designated steps. + """A trigger that triggers after the training step have iterated over some + user-designated steps. This means that it will trigger if there is at least + one `step` in user-designated set of :attr:`steps` within the range + `(last_called_step, current_step]`. + + Args: + initial_user_state: A (any kind of picklable) object representing the + initial :attr:`user_state`. + action (function): A function which is called to update + :attr:`user_state` every time the trigger is triggered. + steps (list, tuple, or callable): Represents the user-designated set of + :attr:`steps` described above. There are **2 ways** provided to specify + this set: + + 1. :attr:`steps` is a callable. When calling + `steps(last_called_step, current_step)`, it is assumed to return + a boolean indicating whether there is at least one `step` in the set + within the range `(last_called_step, current_step]`. 
For example, + :code:`steps = lambda l, r: l // n != r // n` denotes the set + `{i * n for any positive integer i}` where `n` is some positive + integer. This option enables user to define any set of steps, even + an infinite set. + + 2. :attr:`steps` is a `list` or `tuple` containing numbers in ascending + order. These numbers compose the whole set. """ def __init__(self, initial_user_state, action, steps): @@ -146,17 +176,44 @@ def __init__(self, initial_user_state, action, steps): """ super(ScheduledTrigger, self).__init__(initial_user_state, action) self._steps = steps - self._advance_steps() - def _advance_steps(self): - self._next_step = next(step, None) + if callable(self._steps): + self._last_called_step = None + + else: + self._index = 0 + + @property + def _state_names(self): + return super(ScheduledStepsTrigger, self)._state_names + [ + '_last_called_step' if callable(self._steps) else '_index'] def _predicate(self, step): - while self._next_step is not None and step < self._next_step: - self._advance_steps() - if self._next_step is not None and step == self._next_step: - return True - return False + if callable(self._steps): + ret = self._steps(self._last_called_step, step) + self._last_call_step = step + + else: + ret = False + while self._index < len(self._steps) and \ + self._steps[self._index] <= step: + ret = True + self._index += 1 + + return ret + + def __call__(self, step): + """The trigger must be called to update the current training step + (:attr:`step`). + + Args: + step (int): Current training step to update. The training step must + be updated in ascending order. + + Returns: + A boolean denotes whether triggered this time. + """ + return super(ScheduledStepsTrigger, self).__call__(step) class BestEverConvergenceTrigger(Trigger): @@ -216,9 +273,7 @@ def __call__(self, step, score): score (float): Current value of the maintained metric. Returns: - A tuple `(triggered, retval)`, where boolean `triggered` denotes - whether triggered this time and `retval` is the return value of the - action performed this time. + A boolean denotes whether triggered this time. 
""" return super(BestEverConvergenceTrigger, self).__call__(step, score) From ad56c3effb17bfc8a279afb9e46a7df6df6c571e Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 5 Nov 2018 11:27:09 -0500 Subject: [PATCH 60/65] fix final annealing bug --- .../differentiable_expected_bleu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 6c567fe3..218e0b3b 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -339,7 +339,7 @@ def _eval_epoch(sess, summary_writer, mode): sess.run(tf.tables_initializer()) def action(i): - if i >= len(phases): + if i >= len(phases) - 1: return i i += 1 train_data_name, train_op_name, mask_pattern = phases[i] From 1f3e21278c21336a1f21cdb533caaa30123d074f Mon Sep 17 00:00:00 2001 From: Zichao Yang Date: Mon, 5 Nov 2018 14:08:41 -0500 Subject: [PATCH 61/65] add config restore_from --- .../differentiable_expected_bleu.py | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index 218e0b3b..ac6573b0 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -35,6 +35,9 @@ flags.DEFINE_string("config_train", "config_train", "The training config.") flags.DEFINE_string("expr_name", "iwslt14_de-en", "The experiment name. " "Used as the directory name of run.") +flags.DEFINE_string("restore_from", "", "The specific checkpoint path to " + "restore from. 
If not specified, the latest checkpoint in " + "expr_name is restored.") flags.DEFINE_boolean("reinitialize", True, "Whether to reinitialize the state " "of the optimizers before training and after triggering.") @@ -44,6 +47,7 @@ config_data = importlib.import_module(FLAGS.config_data) config_train = importlib.import_module(FLAGS.config_train) expr_name = FLAGS.expr_name +restore_from = FLAGS.restore_from reinitialize = FLAGS.reinitialize phases = config_train.phases @@ -226,23 +230,29 @@ def _save_to(directory, step): print('saved to {}'.format(saved_path)) - def _restore_from(directory, restore_trigger_names): - if os.path.exists(directory): - ckpt_path = tf.train.latest_checkpoint(directory) - print('restoring from {} ...'.format(ckpt_path)) - saver.restore(sess, ckpt_path) + def _restore_from_path(ckpt_path, restore_trigger_names=None): + print('restoring from {} ...'.format(ckpt_path)) + saver.restore(sess, ckpt_path) + + if restore_trigger_names is None: + restore_trigger_names = ['convergence_trigger', 'annealing_trigger'] + + for trigger_name in restore_trigger_names: + trigger = globals()[trigger_name] + trigger_path = '{}.{}'.format(ckpt_path, trigger_name) + if os.path.exists(trigger_path): + print('restoring {} ...'.format(trigger_name)) + with open(trigger_path, 'rb') as pickle_file: + trigger.restore_from_pickle(pickle_file) + else: + print('cannot find previous {} state.'.format(trigger_name)) - for trigger_name in restore_trigger_names: - trigger = globals()[trigger_name] - trigger_path = '{}.{}'.format(ckpt_path, trigger_name) - if os.path.exists(trigger_path): - print('restoring {} ...'.format(trigger_name)) - with open(trigger_path, 'rb') as pickle_file: - trigger.restore_from_pickle(pickle_file) - else: - print('cannot find previous {} state.'.format(trigger_name)) + print('done.') - print('done.') + def _restore_from(directory, restore_trigger_names=None): + if os.path.exists(directory): + ckpt_path = tf.train.latest_checkpoint(directory) + _restore_from_path(ckpt_path, restore_trigger_names) else: print('cannot find checkpoint directory {}'.format(directory)) @@ -354,7 +364,10 @@ def _restore_and_anneal(): _restore_from(dir_best, ['convergence_trigger']) annealing_trigger.trigger() - _restore_from(dir_model, ['convergence_trigger', 'annealing_trigger']) + if restore_from: + _restore_from_path(restore_from) + else: + _restore_from(dir_model) summary_writer = tf.summary.FileWriter( os.path.join(expr_name, 'log'), sess.graph, flush_secs=30) From 8988209feee68073431ee9fbe63cb857b9cd2ca3 Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Tue, 6 Nov 2018 02:10:32 +0000 Subject: [PATCH 62/65] add test units for ScheduledStepsTrigger and fix some bugs --- texar/utils/triggers.py | 23 ++++++++++---- texar/utils/triggers_test.py | 59 +++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 7 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index f209f377..1553393b 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -165,16 +165,17 @@ class ScheduledStepsTrigger(Trigger): :code:`steps = lambda l, r: l // n != r // n` denotes the set `{i * n for any positive integer i}` where `n` is some positive integer. This option enables user to define any set of steps, even - an infinite set. + an infinite set. Note that in this case the trigger will never + trigger when being called for the first time, because + `last_called_step` is undefined at this time. User can manually call + it to specify an initial step before training. 2. 
:attr:`steps` is a `list` or `tuple` containing numbers in ascending order. These numbers compose the whole set. """ def __init__(self, initial_user_state, action, steps): - """steps should be a list or tuple in increasing order. - """ - super(ScheduledTrigger, self).__init__(initial_user_state, action) + super(ScheduledStepsTrigger, self).__init__(initial_user_state, action) self._steps = steps if callable(self._steps): @@ -188,10 +189,20 @@ def _state_names(self): return super(ScheduledStepsTrigger, self)._state_names + [ '_last_called_step' if callable(self._steps) else '_index'] + @property + def last_called_step(self): + """The step when the trigger is latest called. + """ + return self._last_called_step + def _predicate(self, step): if callable(self._steps): - ret = self._steps(self._last_called_step, step) - self._last_call_step = step + if self._last_called_step is not None: + ret = self._steps(self._last_called_step, step) + else: + ret = False + + self._last_called_step = step else: ret = False diff --git a/texar/utils/triggers_test.py b/texar/utils/triggers_test.py index a3f88a20..979b95ed 100644 --- a/texar/utils/triggers_test.py +++ b/texar/utils/triggers_test.py @@ -8,8 +8,9 @@ import tensorflow as tf import random +import bisect -from texar.utils.triggers import Trigger, BestEverConvergenceTrigger +from texar.utils.triggers import * class TriggerTest(tf.test.TestCase): @@ -22,6 +23,62 @@ def test(self): trigger.trigger() self.assertEqual(trigger.user_state, step+1) + +class ScheduledStepsTriggerTest(tf.test.TestCase): + """Tests :class:`~texar.utils.ScheduledStepsTrigger`. + """ + + def test(self): + for i in range(100): + n = random.randint(1, 100) + m = random.randint(1, n) + p = random.uniform(0, 0.3) + f = lambda l, r: l // n != r // n + trigger = ScheduledStepsTrigger(0, lambda x: x+1, f) + + last_called_step = None + + for step in range(n): + if random.random() < p: + if last_called_step is not None: + triggered_ = f(last_called_step, step) + else: + triggered_ = False + + last_called_step = step + + triggered = trigger(step) + + self.assertEqual(trigger.last_called_step, last_called_step) + self.assertEqual(triggered, triggered_) + + for i in range(100): + n = random.randint(1, 100) + m = random.randint(1, n) + p = random.uniform(0, 0.3) + q = random.uniform(0, 0.3) + steps = [step for step in range(n) if random.random() < q] + f = lambda l, r: bisect.bisect_right(steps, l) < \ + bisect.bisect_right(steps, r) + trigger = ScheduledStepsTrigger(0, lambda x: x+1, steps) + + last_called_step = -1 + + for step in range(n): + if random.random() < p: + triggered_ = f(last_called_step, step) + last_called_step = step + + triggered = trigger(step) + + self.assertEqual(triggered, triggered_) + + trigger = ScheduledStepsTrigger(0, lambda x: x+1, []) + for step in range(100): + trigger.trigger() + self.assertEqual(trigger.user_state, step+1) + + class BestEverConvergenceTriggerTest(tf.test.TestCase): """Tests :class:`~texar.utils.BestEverConvergenceTrigger`. """ From 5851220137cc060d83c6357a501cdcff0b47b9a3 Mon Sep 17 00:00:00 2001 From: wwt Date: Mon, 5 Nov 2018 21:37:43 -0500 Subject: [PATCH 63/65] fix docs for triggers --- docs/code/utils.rst | 5 +++++ texar/utils/triggers.py | 48 ++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/docs/code/utils.rst b/docs/code/utils.rst index c463c752..726d4739 100644 --- a/docs/code/utils.rst +++ b/docs/code/utils.rst @@ -287,6 +287,11 @@ Trigger .. 
autoclass:: texar.utils.Trigger :members: +:hidden:`ScheduledStepsTrigger` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: texar.utils.ScheduledStepsTrigger + :members: + :hidden:`BestEverConvergenceTrigger` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: texar.utils.BestEverConvergenceTrigger diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index 1553393b..ee50c341 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -50,7 +50,7 @@ class Trigger(object): Args: initial_user_state: A (any kind of picklable) object representing the initial :attr:`user_state`. - action (function): A function which is called to update + action (callable): A callable which is called to update :attr:`user_state` every time the trigger is triggered. See above for detailed explanation. .. document private functions @@ -149,29 +149,32 @@ class ScheduledStepsTrigger(Trigger): one `step` in user-designated set of :attr:`steps` within the range `(last_called_step, current_step]`. + There are **2 ways** provided to specify the set of :attr:`steps`: + + 1. :attr:`steps` is a callable. When calling + `steps(last_called_step, current_step)`, it is assumed to return + a boolean indicating whether there is at least one `step` in the set + within the range `(last_called_step, current_step]`. For example, + :code:`steps = lambda l, r: l // n != r // n` denotes the set + `{i * n for any integer i}` where `n` is some integer. This option + enables user to define any set of steps, even an infinite set. Note + that in this case the trigger will never trigger when being called + for the first time, because `last_called_step` is undefined at this + time. User can manually call it to specify an initial step before + training. + + 2. :attr:`steps` is a `list` or `tuple` containing numbers in ascending + order. These numbers compose the whole set. + Args: initial_user_state: A (any kind of picklable) object representing the initial :attr:`user_state`. - action (function): A function which is called to update + action (callable): A callable which is called to update :attr:`user_state` every time the trigger is triggered. steps (list, tuple, or callable): Represents the user-designated set of - :attr:`steps` described above. There are **2 ways** provided to specify - this set: - - 1. :attr:`steps` is a callable. When calling - `steps(last_called_step, current_step)`, it is assumed to return - a boolean indicating whether there is at least one `step` in the set - within the range `(last_called_step, current_step]`. For example, - :code:`steps = lambda l, r: l // n != r // n` denotes the set - `{i * n for any positive integer i}` where `n` is some positive - integer. This option enables user to define any set of steps, even - an infinite set. Note that in this case the trigger will never - trigger when being called for the first time, because - `last_called_step` is undefined at this time. User can manually call - it to specify an initial step before training. - - 2. :attr:`steps` is a `list` or `tuple` containing numbers in ascending - order. These numbers compose the whole set. + :attr:`steps` described above. + .. document private functions + .. automethod:: __call__ """ def __init__(self, initial_user_state, action, steps): @@ -237,15 +240,12 @@ class BestEverConvergenceTrigger(Trigger): Args: initial_user_state: A (any kind of picklable) object representing the initial :attr:`user_state`. 
- action (function): A function which is called to update + action (callable): A callable which is called to update :attr:`user_state` every time the trigger is triggered. threshold_steps (int): Number of steps it should trigger after the best value was last updated. minimum_interval_steps (int): Minimum number of steps between twice firing of the trigger. - default (optional): The value returned after :attr:`action` exhausted. - If not provided, the trigger will do nothing when `StopIteration` - occurs. .. document private functions .. automethod:: __call__ """ @@ -281,7 +281,7 @@ def __call__(self, step, score): Args: step (int): Current training step to update. The training step must be updated in ascending order. - score (float): Current value of the maintained metric. + score (float or int): Current value of the maintained metric. Returns: A boolean denotes whether triggered this time. From 8fdf62ee9d7f54c6fe571598225eafc5e7a532ad Mon Sep 17 00:00:00 2001 From: wwt Date: Mon, 5 Nov 2018 21:44:12 -0500 Subject: [PATCH 64/65] remove unfinished MovingAverageConvergenceTrigger --- texar/utils/triggers.py | 42 ----------------------------------------- 1 file changed, 42 deletions(-) diff --git a/texar/utils/triggers.py b/texar/utils/triggers.py index ee50c341..d4aefdaf 100644 --- a/texar/utils/triggers.py +++ b/texar/utils/triggers.py @@ -310,45 +310,3 @@ def best_ever_score(self): """The best-ever score. """ return self._best_ever_score - - -class MovingAverageConvergenceTrigger(Trigger): - - def __init__(self, initial_user_state, action, n, threshold, - minimum_interval_steps): - super(MovingAverageConvergenceTrigger, self).__init__( - initial_user_state, action) - self._n = n - self._threshold = threshold - self._minimum_interval_steps = minimum_interval_steps - self._last_triggered_step = None - self._head_queue = queue.Queue(self._n) - self._head_sum = 0 - self._rear_queue = queue.Queue(self._n) - self._rear_sum = 0 - - def _predicate(self, step, score): - if self._head_queue.full(): - e = self._head_queue.get() - self._head_sum -= e - if self._rear_queue.full(): - self._rear_sum -= self._rear_queue.get() - self._rear_queue.put(e) - self._rear_sum += e - self._head_queue.put(score) - self._head_sum += score - - if (self._last_triggered_step is None or - step - self._last_triggered_step - >= self._minimum_interval_steps) and \ - self._head_queue.full() and self._rear_queue.full() and \ - self._head_sum - self._rear_sum <= self._n * self._threshold: - self._last_triggered_step = step - return True - return False - - @property - def _state_names(self): - return super(BestEverConvergenceTrigger, self)._state_names + [ - '_last_triggered_step', '_head_queue', '_head_sum', '_rear_queue', - '_rear_sum'] From 3b588830c15d3161d4833a2618aec9d934abe7bf Mon Sep 17 00:00:00 2001 From: wwt Date: Mon, 5 Nov 2018 22:23:25 -0500 Subject: [PATCH 65/65] update README.md --- .../differentiable_expected_bleu/README.md | 27 ++++++++++++++----- .../differentiable_expected_bleu.py | 2 +- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/examples/differentiable_expected_bleu/README.md b/examples/differentiable_expected_bleu/README.md index ad5d685c..5bff077d 100644 --- a/examples/differentiable_expected_bleu/README.md +++ b/examples/differentiable_expected_bleu/README.md @@ -8,7 +8,7 @@ This example builds an attentional seq2seq model for machine translation trained Download the data with the following cmds: -``` +```bash python prepare_data.py --data de-en ``` @@ -16,18 +16,33 @@ python 
prepare_data.py --data de-en Train the model with the following cmd: -``` -python differentiable_expected_bleu.py --config_model config_model --config_data config_iwslt14_de-en --config_train config_train_iwslt14_de-en --pretrain_epochs 8 +```bash +python differentiable_expected_bleu.py --config_model config_model_medium --config_data config_data_iwslt14_de-en --config_train config_train --expr_name iwslt14_de-en --restore_from "" --reinitialize ``` Here: * `--config_model` specifies the model config. Note not to include the `.py` suffix. * `--config_data` specifies the data config. * `--config_train` specifies the training config. - * `--pretrain_epochs` specifies the number of epochs to pretrain with cross-entropy loss. + * `--expr_name` specifies the experiment name. Used as the directory name to save and restore all information. + * `--restore_from` specifies the checkpoint path to restore from. If not specified (or an empty string is specified), the latest checkpoint in `expr_name` is restored. + * `--reinitialize` is a flag indicates whether to reinitialize the state of the optimizers before training and after annealing. Default is enabled. + +[config_model_medium.py](./config_model_medium.py) specifies a single-layer seq2seq model with Luong attention and bi-directional RNN encoder. -[config_model.py](./config_model.py) specifies a single-layer seq2seq model with Luong attention and bi-directional RNN encoder. Hyperparameters taking default values can be omitted from the config file. +[config_model_large.py](./config_model_large.py) specifies a seq2seq model with Luong attention, 2-layer bi-directional RNN encoder, single-layer RNN decoder, and a connector between the final state of the encoder and the initial state of the decoder. The size of this model is quite large. + +[config_data_iwslt14_de-en.py](./config_data_iwslt14_de-en.py) specifies the IWSLT'14 German-English dataset. + +[config_train.py](./config_train.py) specifies the training (including annealing) configs. ## Results ## -On the IWSLT14 dataset, the model achieves `BLEU=25.35` after annealed all masks, while the cross-entropy trained model achieves `BLEU=24.57`. +On the IWSLT'14 German-English dataset, we ran both configs for 4~5 times. Here are the average BLEU scores attained: + +| config | inference beam size | Cross-Entropy baseline | DEBLEU | improvement | +| :------------------------------------------------: | :-----------------: | :--------------------: | :----: | :---------: | +| [config_model_medium.py](./config_model_medium.py) | 1 | 26.12 | 27.40 | 1.28 | +| [config_model_medium.py](./config_model_medium.py) | 5 | 27.03 | 27.72 | 0.70 | +| [config_model_large.py](./config_model_large.py) | 1 | 25.24 | 26.47 | 1.23 | +| [config_model_large.py](./config_model_large.py) | 5 | 26.33 | 26.87 | 0.54 | diff --git a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py index ac6573b0..0c414b21 100755 --- a/examples/differentiable_expected_bleu/differentiable_expected_bleu.py +++ b/examples/differentiable_expected_bleu/differentiable_expected_bleu.py @@ -39,7 +39,7 @@ "restore from. If not specified, the latest checkpoint in " "expr_name is restored.") flags.DEFINE_boolean("reinitialize", True, "Whether to reinitialize the state " - "of the optimizers before training and after triggering.") + "of the optimizers before training and after annealing.") FLAGS = flags.FLAGS
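
For reference, a minimal usage sketch of the reworked trigger API (illustrative only, not part of the patches above). It assumes `texar.utils.triggers` as it stands after the "reconstruct triggers" patch: a picklable `user_state`, a callable `action(user_state)`, and `__call__(step, score)` returning a plain boolean. The phase names and numeric values are hypothetical, and the `anneal` action only mirrors, in spirit, the phase-advancing `action(i)` used in `differentiable_expected_bleu.py`.

```python
# Minimal usage sketch of the reworked trigger API (illustrative only).
from texar.utils.triggers import BestEverConvergenceTrigger

# Hypothetical phase names, standing in for the (data, train_op, mask_pattern)
# phases configured in config_train.py.
phases = ['xe_0', 'xe_1', 'debleu_0', 'debleu_1']

def anneal(i):
    # action(user_state) -> new user_state: advance to the next phase index.
    return min(i + 1, len(phases) - 1)

trigger = BestEverConvergenceTrigger(
    initial_user_state=0,       # start in the first phase
    action=anneal,
    threshold_steps=3,          # fire once the best score has stalled for 3 steps
    minimum_interval_steps=2)   # and at least 2 steps since the last firing

val_bleu = [10.0, 12.0, 12.5, 12.4, 12.3, 12.2, 12.1]  # synthetic scores
for step, bleu in enumerate(val_bleu):
    if trigger(step, bleu):     # updates best-ever bookkeeping, triggers if stalled
        print('annealing to phase', phases[trigger.user_state])

print('best BLEU {} at step {}'.format(
    trigger.best_ever_score, trigger.best_ever_step))
```

As in the example script, the trigger state (including `user_state`) can be checkpointed with `save_to_pickle(file)` and recovered with `restore_from_pickle(file)`; after the rework, restoring no longer replays the action.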